asked on

Extended ascii characters reading problem

Hi ,

I have a program that reads a Unicode file. All ASCII and Unicode characters are read correctly with fread and stored in memory in a variable wchar_t* file_buffer; then I parse the buffer and store the data in a struct.

The problem began when I put extended ASCII characters in the Unicode file: the content of file_buffer is then filled only up to the point where the extended ASCII character is read. It seems to be treated like an EOF?

So the problem seems to come from the fread function...
Normally extended ASCII characters are stored in an unsigned char, and wchar_t is
basically unsigned as well, isn't it?

Can you check the code and capture with and without extended ascii?
Thank you

pgnatyuk

I'd recommend not mixing encodings: if the file is Unicode, save all the data in it in Unicode format.
Convert any text you are going to save to Unicode. In your previous threads you can find all the functions needed for that (MultiByteToWideChar).
You will see for yourself how well it works.

pgnatyuk

As far as I remember, your projects use the Unicode character set and you do use MFC.
If I'm right, I recommend again that you use the CString class, which does all the conversions automatically.

Develprog

ASKER

Sorry, I forgot the code :)

Here are the code and the captures.
I also included the part of the code that checks (parses) this extended ASCII character.

The content of the file is:
-------------------------------------------

#TITLE MENU
18454║Cancit║Logina
18455║Nextit , et concor g parcours , et║Loginb

#TITLE EXTRA
18456║Previt║Loginc
18457║ Previtia ║Logind
---------------------------------------------------

The extended character is '║', code 186 (0xBA).

I also enclose the good capture, taken with the normal pipe character (0x7C) instead of ║.

Thank you


public:	//variables
	
	// Heap buffer that LoadAndParsing fills with the file's text as wide
	// characters. LoadAndParsing frees any previous buffer and allocates a
	// new one with calloc; the gl_arr[] string pointers it sets point into
	// this buffer.
	wchar_t* file_buffer; // file data
	// Size of the loaded file in bytes, as returned by ftell in LoadAndParsing.
	long lSize;

/*
 * Load a UTF-16LE text file into file_buffer and split it, in place, into
 * records stored in gl_arr[] (id / txt_translate / txt_comment).
 *
 * File layout handled here:
 *   - the first two bytes (the FF FE byte-order mark) are skipped;
 *   - a line starting with '#', a delimiter or a TAB is skipped;
 *   - every other line is "id<delim>text<delim>comment", terminated by LF
 *     (the character before the LF, normally CR, is overwritten with 0).
 * The delimiter is accepted as either U+00BA or U+2551: the box-drawing
 * character typed with ALT+186 is stored in the file as 0x2551, not 0xBA.
 *
 * The buffer is modified: delimiters and line ends are replaced by 0 so
 * that the gl_arr[] pointers reference zero-terminated strings inside it.
 *
 * NOTE(review): the wchar_t view of the raw bytes assumes a 16-bit wchar_t
 * (as on Windows); gl_arr[] is indexed by cnt_line without a capacity check
 * because its size is not visible here; cnt_line is not reset here.
 * NOTE(review): wprintf only renders non-ASCII characters such as U+2551
 * once the caller has selected a suitable locale with setlocale().
 *
 * file_name: path of the file to load.
 * Returns -1 if the file cannot be opened, measured, read or buffered,
 * otherwise ret_err (0 in the code shown).
 */
long LoadAndParsing (char* file_name)
{
	FILE* streaminit=NULL;
	int file_manip=1;
	int fByte[2];
	int ret_err=0;
	/* ..... (further declarations were elided in the original post) */

	// Both encodings of the "double pipe" field separator are accepted.
	const wchar_t delim_latin1 = 0x00BA; // MASCULINE ORDINAL INDICATOR (ALT+0186)
	const wchar_t delim_box    = 0x2551; // BOX DRAWINGS DOUBLE VERTICAL (ALT+186)

	if((streaminit = fopen(file_name, "rb"))==NULL)
	{
		return -1;
	}

	// Measure the file; a failed ftell returns -1 and must not be used as a size.
	if(fseek (streaminit , 0 , SEEK_END)!=0)
	{
		fclose(streaminit);
		return -1;
	}
	lSize = ftell (streaminit);
	if(lSize<0)
	{
		fclose(streaminit);
		return -1;
	}

	// Replace any previous buffer. calloc zero-fills, so the text is always
	// followed by terminating zeros (the +100 elements are extra padding).
	free(file_buffer);
	file_buffer =(wchar_t*) calloc ( (size_t)lSize+100 , sizeof (wchar_t));
	if(file_buffer==NULL)
	{
		fclose(streaminit);
		return -1;
	}

	file_manip=fseek(streaminit, 2L, SEEK_SET); //to avoid FF FE
	if(file_manip!=0)
	{
		fclose(streaminit);
		return -1;
	}
	const size_t bytes_read = fread (file_buffer,1,(size_t)lSize,streaminit);
	fclose(streaminit);

	// Number of wide characters actually loaded; lSize is a byte count and
	// must not be used as a character count.
	const unsigned int char_count = (unsigned int)(bytes_read / sizeof (wchar_t));

	//display file
	printf("\n<Contain of file_buffer>\n");
	wprintf(L"%ls",file_buffer); 
	printf("\n</Contain of file_buffer>\n\n");		
	//PARSING CODE
	int cnt_field=0;          // index of the field being read on the current line
	int count_little=0;       // position counter within the current line
	bool start_of_line =true;
	bool front_of_line=false;
	bool with_comment=false;
	for(unsigned int count_char =0; count_char<char_count; count_char++)
	{
		count_little++;
		if((file_buffer[count_char]==0x23)) // #
		{
			// Skip the whole comment line; stop at the end of the data if
			// the last line has no LF.
			while((count_char<char_count)&&(file_buffer[count_char]!=0x0A))
			{
				count_char++;
			}
			cnt_field =0;
			count_little=0;
			file_buffer[count_char] =0;
			start_of_line=true; 
			with_comment=true;
		}

		const bool at_delim = (file_buffer[count_char]==delim_latin1)
		                   || (file_buffer[count_char]==delim_box);
		if(at_delim && ((count_little==0)||(count_little==1))) // minimum id_number character is 1
		{
			// A delimiter where the id should be: skip the line.
			while((count_char<char_count)&&(file_buffer[count_char]!=0x0A))
			{
				count_char++;
			}
			cnt_field =0;
			file_buffer[count_char] =0;
			count_little=0;
			start_of_line=true;
			with_comment=true;
		}
		else if(at_delim) // xxxx<delim>xxxx
		{
			// Terminate the previous field and record where the next one starts.
			file_buffer[count_char] =0;
			
			if(cnt_field==0)
			{
				gl_arr[cnt_line].txt_translate = &file_buffer[count_char+1];
			}
			if(cnt_field==1)
			{
				gl_arr[cnt_line].txt_comment = &file_buffer[count_char+1];

				printf("2) count_char=%u \n", count_char);
			}
			++cnt_field;
			start_of_line=false; 
		}
		
		if((start_of_line==true)&&((count_little==0)||(count_little==1)||(count_little==2))) // first id_number
		{
			cnt_field =0;
			
			if(with_comment==true)
			{
				// count_char sits on the zeroed end of the skipped line.
				gl_arr[cnt_line].id = &file_buffer[count_char+1];
			}
			else
			{
				gl_arr[cnt_line].id = &file_buffer[count_char];
			}
			start_of_line=false;
		}
		else if((front_of_line==true)&&((count_little==0)||(count_little==1))) // id_number
		{
			cnt_field =0;
			gl_arr[cnt_line].id = &file_buffer[count_char];
			front_of_line=false;
		}
		
		if((file_buffer[count_char]==0x0A)&&(!(count_little<=2))) // LF
		{
			// End of a data line: close the record and start the next one.
			cnt_field =0;
			++cnt_line;

			file_buffer[count_char-1] =0; // character before the LF (CR)
			file_buffer[count_char] =0;

			gl_arr[cnt_line].id = &file_buffer[count_char+1];

			count_little=0 ; 
			front_of_line=true;
			printf("0) count_char=%u\n", count_char);
			printf("\n");
		}
		else if((file_buffer[count_char]==0x0A)) // LF
		{
			// (Nearly) empty line: no record is counted.
			cnt_field =0;
			count_little=0 ; 
			front_of_line=true;
		}
		
		if((file_buffer[count_char]==0x09)&&(count_little<=2)) // TAB
		{
			// A line starting with a TAB is skipped.
			while((count_char<char_count)&&(file_buffer[count_char]!=0x0A))
			{
				count_char++;
			}
			file_buffer[count_char] =0;
			cnt_field =0;
			count_little=0 ; 
			front_of_line=true;
		}
	}
	//display
	wprintf(L"counted line(s) is=%d\n", cnt_line);
	
	printf("Display array of struct in C-style\n");
	printf("-------------\n");
	
	for(int line=0;line<cnt_line;++line)
	{	
		// %ls is the wide-string conversion on both MSVC and ISO C;
		// a plain %s in wprintf means a wide string only on MSVC.
		wprintf(L"  ");
		wprintf(L"	id=%-8ls ",  gl_arr[line].id);
		wprintf(L"	txt=%-10ls ", gl_arr[line].txt_translate);
		wprintf(L"	cmt=%-10ls ", gl_arr[line].txt_comment);
		wprintf(L"\n");
	}
	printf("-------------\n");
	
	return ret_err;
}

Open in new window

capture-with-ascii-extended-file.JPG
capture-with-ascii-normal-file.JPG

Develprog

ASKER

Yes,

Parsing works just by changing

file_buffer[count_char]==0xBA => file_buffer[count_char]==0x2551

but why?

The result of the parsing is then the same as with the normal ASCII file.

The only small remaining issue is the display of the file content when an extended ASCII character appears:

wprintf(L"%ls",file_buffer); can't display the extended ASCII character.

How can I fix it?

Thank you

Let_Me_Be

It won't display it because you are mixing encodings.

Extended ASCII is in some 8-bit encoding (there are tons of such encodings). wprintf (and anything working with wide characters) expects the internal wide encoding (you are using Windows, therefore it is UTF-16, or UCS-2 for Windows 2000 and older; on other systems it is UTF-32).

Therefore wprintf simply won't display anything because there are no symbols with the values you are passing to wprintf (or there are, but have different meaning).

pgnatyuk

In short, there is no problem with the file reading.
You do not see the file content because wprintf expects to receive Unicode text.
So if you want to work with a Unicode data file, convert everything you are going to store there to Unicode.
As it stands, you can call your file a binary data file; it is not a Unicode text file.

Develprog

ASKER

Yes,

For sure, with the hex editor I can see a '00' byte in each Unicode character,
but the extended ASCII character that I added to the file doesn't have that, so this

string : " 18454║Cancit "

is in HEX : 31 00 38 00 34 00 35 00 34 00 51 25 43 00 61 00 6E 00 63 00 69 00 74 00

What I typed is ALT + 186, and the result is ║,
but with ALT + 0 + 186 the result is º, and that one is Unicode.

It would be better to look for a Unicode character similar to this extended ASCII character and type it with ALT + 0 + code.

That would be better, I think, wouldn't it?

Thank you

Develprog

ASKER

Ok

You are right, it is not a reading problem.

But I thought that an extended ASCII character fit in 1 byte, because when we look at the ASCII table, the integer value of this character is 186, which is BA in hex.

So why can't I compare with 0xBA, while using this instead works well:

//(file_buffer[count_char]==0xBA)
(file_buffer[count_char]==0x2551)

At the same time, when we look at the Unicode table, we can see that
this character is indeed a Unicode character (http://fr.wikipedia.org/wiki/Table_des_caract%C3%A8res_Unicode_(2000-2FFF))

Is extended ASCII Unicode? If so, why doesn't wprintf display the data?

Thank you

ASKER CERTIFIED SOLUTION

pgnatyuk

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Develprog

ASKER

So, as you said, the problem was not in the reading but in the displaying. By the way, it taught me more about Unicode (wprintf, setlocale).

Thank you

Develprog

ASKER

Ok,

So, as you said, it was a data encoding problem.

With setlocale, wprintf displays the data correctly.

So the question is closed.

Thank you

pgnatyuk

You are welcome