plq
asked on
How to write the byte order markers at the start of a UNICODE file
I've got this function which writes text to a file...
/*
 * Writes the NUL-terminated string sbuffer to the file sfile, replacing any
 * content the file already has.
 *
 * The text is written exactly as it is held in memory: TCHAR units followed
 * by the terminating NUL. No byte order mark is written and no encoding
 * conversion is performed.
 *
 * sfile   - path of the file to write.
 * sbuffer - NUL-terminated text to write.
 *
 * Returns the number of bytes written, or 0 when an argument is NULL, the
 * file could not be opened, or the write failed.
 */
int writetextfile(TCHAR * sfile, TCHAR * sbuffer)
{
    DWORD dwBytesWritten = 0;
    DWORD dwBytesToWrite;
    HANDLE hFile;

    if (sfile == NULL || sbuffer == NULL)
    {
        return 0;
    }

    SetErrorMode(SEM_NOOPENFILEERRORBOX | SEM_FAILCRITICALERRORS);

    // fix for cust xxx - files having content after eof:
    // truncate an existing file (this used to be a single CREATE_ALWAYS open),
    // and only create the file when it cannot be opened for truncation.
    hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, TRUNCATE_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_NEW, FILE_ATTRIBUTE_NORMAL, NULL);
    }
    if (hFile == INVALID_HANDLE_VALUE)
    {
        return 0;
    }

    // NOTE(review): the "+ 1" also writes the terminating NUL into the file.
    // Kept as-is because existing readers may depend on it - confirm.
    dwBytesToWrite = (DWORD)((_tcslen(sbuffer) + 1) * sizeof(TCHAR));
    if (!WriteFile(hFile, sbuffer, dwBytesToWrite, &dwBytesWritten, NULL))
    {
        // Report a failed write as "nothing written".
        dwBytesWritten = 0;
    }

    SetEndOfFile(hFile);
    CloseHandle(hFile);
    return (int)dwBytesWritten;
}
I also need to write the marker characters.
Without these, I'm finding that the VB6 FSO can read the file OK, because you can tell it that the file is Unicode, but when the file is read from VB.NET 2005, the byte order marks aren't there, so .NET misinterprets the file format as ASCII, and I end up with an "x 0 x 0 x 0" string that breaks the code when you do anything with it.
All our stuff is UTF-8, so I think the bytes should be EF BB BF. What's the easiest way to modify the above to include them?
my C is a tad rusty and I need to get this working asap !
thanks
/*
 * Writes the NUL-terminated string sbuffer to the file sfile, replacing any
 * content the file already has.
 *
 * The text is written exactly as it is held in memory: TCHAR units followed
 * by the terminating NUL. No byte order mark is written and no encoding
 * conversion is performed.
 *
 * sfile   - path of the file to write.
 * sbuffer - NUL-terminated text to write.
 *
 * Returns the number of bytes written, or 0 when an argument is NULL, the
 * file could not be opened, or the write failed.
 */
int writetextfile(TCHAR * sfile, TCHAR * sbuffer)
{
    DWORD dwBytesWritten = 0;
    DWORD dwBytesToWrite;
    HANDLE hFile;

    if (sfile == NULL || sbuffer == NULL)
    {
        return 0;
    }

    SetErrorMode(SEM_NOOPENFILEERRORBOX | SEM_FAILCRITICALERRORS);

    // fix for cust xxx - files having content after eof:
    // truncate an existing file (this used to be a single CREATE_ALWAYS open),
    // and only create the file when it cannot be opened for truncation.
    hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, TRUNCATE_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_NEW, FILE_ATTRIBUTE_NORMAL, NULL);
    }
    if (hFile == INVALID_HANDLE_VALUE)
    {
        return 0;
    }

    // NOTE(review): the "+ 1" also writes the terminating NUL into the file.
    // Kept as-is because existing readers may depend on it - confirm.
    dwBytesToWrite = (DWORD)((_tcslen(sbuffer) + 1) * sizeof(TCHAR));
    if (!WriteFile(hFile, sbuffer, dwBytesToWrite, &dwBytesWritten, NULL))
    {
        // Report a failed write as "nothing written".
        dwBytesWritten = 0;
    }

    SetEndOfFile(hFile);
    CloseHandle(hFile);
    return (int)dwBytesWritten;
}
I also need to write the marker characters.
Without these, I'm finding that the VB6 FSO can read the file OK, because you can tell it that the file is Unicode, but when the file is read from VB.NET 2005, the byte order marks aren't there, so .NET misinterprets the file format as ASCII, and I end up with an "x 0 x 0 x 0" string that breaks the code when you do anything with it.
All our stuff is UTF-8, so I think the bytes should be EF BB BF. What's the easiest way to modify the above to include them?
my C is a tad rusty and I need to get this working asap !
thanks
I didn't understand exactly where you want to add the three bytes - at the beginning or at the end. This code writes them at the beginning. If you want to add them at the end, move the WriteFile(hFile, buffer...) line after the WriteFile(hFile, sbuffer...) line.
ASKER
Thanks Alex. Is there any way we can tune this to only write once ?
I started writing this...
// Sketch: reserve room for the UTF-8 byte order mark in front of the text so
// that marker and text can be handed to writetextfile() as one buffer.
#ifdef UNICODE
// Place the unicode utf8 byte order marks at the start of the file
// NOTE(review): createstring() is not shown here; presumably it allocates the
// requested number of TCHARs - verify.
TCHAR * sbufferbase = createstring(12, lcharsneeded + 3); // freed
// NOTE(review): each store below fills one whole TCHAR, so in a UNICODE build
// the marker takes up 6 bytes in the file instead of the 3 bytes EF BB BF.
*sbufferbase = 0xEF;
*(sbufferbase + 1) = 0xBB;
*(sbufferbase + 2) = 0xBF;
// The text itself starts right after the three marker slots.
TCHAR * sbuffer = sbufferbase + 3;
#else
// no byte order marks
TCHAR * sbufferbase = createstring(12, lcharsneeded); // freed
// Without a marker the text starts at the beginning of the buffer.
TCHAR * sbuffer = sbufferbase;
#endif
// fill up sbuffer as before
...
// Pass the base pointer so that the marker slots (if any) are written too.
writetextfile(spath, sbufferbase);
because I only want to call WriteFile once. This is a file being written to a heavily flogged share, so I want to minimise IO
My code above is obviously flawed: in a UNICODE build a TCHAR is two bytes wide, so it puts 6 bytes at the front of the file instead of 3.
Can you see an easy way of doing it which allows just one write ?
thanks
I started writing this...
// Sketch: reserve room for the UTF-8 byte order mark in front of the text so
// that marker and text can be handed to writetextfile() as one buffer.
#ifdef UNICODE
// Place the unicode utf8 byte order marks at the start of the file
// NOTE(review): createstring() is not shown here; presumably it allocates the
// requested number of TCHARs - verify.
TCHAR * sbufferbase = createstring(12, lcharsneeded + 3); // freed
// NOTE(review): each store below fills one whole TCHAR, so in a UNICODE build
// the marker takes up 6 bytes in the file instead of the 3 bytes EF BB BF.
*sbufferbase = 0xEF;
*(sbufferbase + 1) = 0xBB;
*(sbufferbase + 2) = 0xBF;
// The text itself starts right after the three marker slots.
TCHAR * sbuffer = sbufferbase + 3;
#else
// no byte order marks
TCHAR * sbufferbase = createstring(12, lcharsneeded); // freed
// Without a marker the text starts at the beginning of the buffer.
TCHAR * sbuffer = sbufferbase;
#endif
// fill up sbuffer as before
...
// Pass the base pointer so that the marker slots (if any) are written too.
writetextfile(spath, sbufferbase);
because I only want to call WriteFile once. This is a file being written to a heavily flogged share, so I want to minimise IO
My code above is obviously flawed: in a UNICODE build a TCHAR is two bytes wide, so it puts 6 bytes at the front of the file instead of 3.
Can you see an easy way of doing it which allows just one write ?
thanks
ASKER
BTW Unicode byte order marks sit at the beginning as far as I know..
ASKER
I have tried the UTF8 marker with Alex's suggestion above, and the file displays in notepad with a space between each character.
Notepad is capable of displaying unicode files so the format must be wrong.
I have tried the 3 bytes, and the 3 bytes + one zero, at the front of the file. The extra 0 in 4th place screws it up completely. With just the 3 bytes it seems OK except for the spacing.
Notepad is capable of displaying unicode files so the format must be wrong.
I have tried the 3 bytes, and the 3 bytes + one zero, at the front of the file. The extra 0 in 4th place screws it up completely. With just the 3 bytes it seems OK except for the spacing.
ASKER
Here's a working example
#include "globals.h"
#include "stdafx.h"
#include <stdlib.h>
#include <string.h>
#include <tchar.h>
#include <stdio.h>
#include <windows.h>
#include <malloc.h>
/*
 * Example: writes a UTF-8 text file - byte order mark followed by the text -
 * to C:\ZX.TXT using a single WriteFile call.
 *
 * The text is converted from UTF-16 to UTF-8 with WideCharToMultiByte, so the
 * example has to be built with UNICODE defined (TCHAR is then a wide char).
 *
 * Returns 0 when the file was written completely, 1 otherwise.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    // UTF-8 byte order mark; unsigned char because the values exceed 0x7F.
    static const unsigned char smarker[3] = { 0xEF, 0xBB, 0xBF };
    const TCHAR * sfile = TEXT("C:\\ZX.TXT");
    // NOTE(review): the sample text appears to contain left-over HTML
    // character references ("&# 12301;" etc.) - confirm the intended text.
    const TCHAR * swork = TEXT("HELLO リーズ&# 12301; 398;௙ 0; 话作出&# 20102; 453;");
    DWORD dwBytesWritten = 0;
    DWORD dwBytesToWrite;
    HANDLE hFile;
    char * utf8;
    int lutf8;
    int lresult = 1;

    (void)argc;
    (void)argv;

    // First call only asks how many bytes the UTF-8 form needs. Because the
    // source length is -1, the count includes the terminating NUL.
    lutf8 = WideCharToMultiByte(CP_UTF8, 0, swork, -1, NULL, 0, NULL, NULL);
    if (lutf8 <= 0)
    {
        return 1;
    }

    // One buffer for marker + text so that the file is written in one go.
    utf8 = (char *) malloc(sizeof(smarker) + (size_t)lutf8);
    if (utf8 == NULL)
    {
        return 1;
    }
    memcpy(utf8, smarker, sizeof(smarker));
    if (WideCharToMultiByte(CP_UTF8, 0, swork, -1, utf8 + sizeof(smarker), lutf8, NULL, NULL) != lutf8)
    {
        free(utf8);
        return 1;
    }

    SetErrorMode(SEM_NOOPENFILEERRORBOX | SEM_FAILCRITICALERRORS);

    // TRUNCATE_EXISTING fails when the file does not exist yet, so fall back
    // to creating it.
    hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, TRUNCATE_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_NEW, FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (hFile != INVALID_HANDLE_VALUE)
    {
        // Marker + text; the terminating NUL is not part of a text file.
        dwBytesToWrite = (DWORD)(sizeof(smarker) + (size_t)lutf8 - 1);
        if (WriteFile(hFile, utf8, dwBytesToWrite, &dwBytesWritten, NULL) && dwBytesWritten == dwBytesToWrite)
        {
            lresult = 0;
        }
        SetEndOfFile(hFile);
        CloseHandle(hFile);
    }

    free(utf8);
    return lresult;
}
#include "globals.h"
#include "stdafx.h"
#include <stdlib.h>
#include <string.h>
#include <tchar.h>
#include <stdio.h>
#include <windows.h>
#include <malloc.h>
/*
 * Example: writes a UTF-8 text file - byte order mark followed by the text -
 * to C:\ZX.TXT using a single WriteFile call.
 *
 * The text is converted from UTF-16 to UTF-8 with WideCharToMultiByte, so the
 * example has to be built with UNICODE defined (TCHAR is then a wide char).
 *
 * Returns 0 when the file was written completely, 1 otherwise.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    // UTF-8 byte order mark; unsigned char because the values exceed 0x7F.
    static const unsigned char smarker[3] = { 0xEF, 0xBB, 0xBF };
    const TCHAR * sfile = TEXT("C:\\ZX.TXT");
    // NOTE(review): the sample text appears to contain left-over HTML
    // character references ("&# 12301;" etc.) - confirm the intended text.
    const TCHAR * swork = TEXT("HELLO リーズ&# 12301; 398;௙ 0; 话作出&# 20102; 453;");
    DWORD dwBytesWritten = 0;
    DWORD dwBytesToWrite;
    HANDLE hFile;
    char * utf8;
    int lutf8;
    int lresult = 1;

    (void)argc;
    (void)argv;

    // First call only asks how many bytes the UTF-8 form needs. Because the
    // source length is -1, the count includes the terminating NUL.
    lutf8 = WideCharToMultiByte(CP_UTF8, 0, swork, -1, NULL, 0, NULL, NULL);
    if (lutf8 <= 0)
    {
        return 1;
    }

    // One buffer for marker + text so that the file is written in one go.
    utf8 = (char *) malloc(sizeof(smarker) + (size_t)lutf8);
    if (utf8 == NULL)
    {
        return 1;
    }
    memcpy(utf8, smarker, sizeof(smarker));
    if (WideCharToMultiByte(CP_UTF8, 0, swork, -1, utf8 + sizeof(smarker), lutf8, NULL, NULL) != lutf8)
    {
        free(utf8);
        return 1;
    }

    SetErrorMode(SEM_NOOPENFILEERRORBOX | SEM_FAILCRITICALERRORS);

    // TRUNCATE_EXISTING fails when the file does not exist yet, so fall back
    // to creating it.
    hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, TRUNCATE_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        hFile = CreateFile(sfile, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_NEW, FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (hFile != INVALID_HANDLE_VALUE)
    {
        // Marker + text; the terminating NUL is not part of a text file.
        dwBytesToWrite = (DWORD)(sizeof(smarker) + (size_t)lutf8 - 1);
        if (WriteFile(hFile, utf8, dwBytesToWrite, &dwBytesWritten, NULL) && dwBytesWritten == dwBytesToWrite)
        {
            lresult = 0;
        }
        SetEndOfFile(hFile);
        CloseHandle(hFile);
    }

    free(utf8);
    return lresult;
}
ASKER
The key was to translate the TCHAR from UNICODE to UTF8, as well as adding the byte order marks.
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
{
    // UTF-8 byte order mark. unsigned char, because 0xEF/0xBB/0xBF do not
    // fit into a signed char.
    static const unsigned char buffer[3] = { 0xEF, 0xBB, 0xBF };
    DWORD dwMarkerWritten = 0;

    dwBytesWritten = 0;

    // Marker first, then the text; the text is skipped when the marker could
    // not be written, so a failed write is reported as 0 bytes.
    if (WriteFile(hFile, buffer, sizeof(buffer), &dwMarkerWritten, NULL))
    {
        // NOTE(review): sbuffer is written as raw TCHAR units. In a UNICODE
        // build that is not UTF-8, so the content does not match the marker;
        // convert the text first (e.g. WideCharToMultiByte with CP_UTF8).
        WriteFile(hFile, sbuffer, (DWORD)((_tcslen(sbuffer) + 1) * sizeof(TCHAR)), &dwBytesWritten, NULL);
    }

    SetEndOfFile(hFile);
    CloseHandle(hFile);
    // Byte count of the text write only; the marker is not included.
    return dwBytesWritten;
}