Solved

Change to the struct

Posted on 2015-02-14
72
192 Views
Last Modified: 2015-03-05
Hi,
I have a struct like this:

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-layout record that is written to / read from the binary files as raw
// bytes, so the member order and sizes must not change.
// Requires <string.h> and <stdlib.h> to be included before this header.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     // Length of item1 in bytes, never more than sizeof(item1).
     // memchr keeps the scan inside the array even if item1 lacks a terminator.
     int  get_len() const
     {
         const void * terminator = memchr(item1, '\0', sizeof(item1));
         if (terminator == 0)
             return (int)sizeof(item1);
         return (int)((const char *)terminator - item1);
     }

     // Converts item1 to wide characters into nm_uni, which has room for
     // sizfld wide characters. The result is null-terminated whenever the
     // terminator fits; a name that fills the whole buffer is truncated
     // without terminator. On an invalid multibyte sequence nm_uni becomes
     // an empty string. Nothing is written when sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // Convert from a terminated copy so mbstowcs can never read past item1.
         char narrow[sizeof(item1) + 1];
         const int len = get_len();
         memcpy(narrow, item1, len);
         narrow[len] = '\0';
         const size_t converted = mbstowcs(nm_uni, narrow, (size_t)sizfld);
         if (converted == (size_t)-1)
             nm_uni[0] = L'\0';
     }

     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // strncmp bounds the comparison to the array size.
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0) return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window


here is the relevant project
// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
static const int       kChunkCount      = 80;       // number of sorted chunk files
static const long long kRecordsPerChunk = 1000000;  // records generated per chunk file
static const int       kNameChars       = 20;       // random letters per key (buffers hold 21 bytes)
static const long long kProgressStep    = 10000;    // print a progress line every N records

// Path of chunk file number `index`.
static std::string chunk_file_name(int index)
{
	std::ostringstream filename;
	filename << "c:\\dp4b\\flout" << index << ".bin";
	return filename.str();
}

// Fills `item` with kNameChars random letters (upper or lower case) and
// `text` with "<item> <item> <item>" (62 characters, fits the 100 byte field).
static void fill_random_item(char * item, char * text)
{
	for (int j = 0; j < kNameChars; ++j)
	{
		const char base = (rand() % 2) ? 'A' : 'a';
		item[j] = (char)(base + rand() % 26);
	}
	item[kNameChars] = '\0';
	strcpy(text, item);
	strcat(text, " ");
	strcat(text, item);
	strcat(text, " ");
	strcat(text, item);
}

// Phase 1: generate kChunkCount chunk files, each holding kRecordsPerChunk
//          random records sorted by struc3::operator<.
// Phase 2: merge the chunk files into c:\dp4b\flout_w.bin, writing every
//          record exactly once.
// Returns 0 on success; errno (or -1) when an output file cannot be opened,
// -2 when a chunk cannot be written, -3/-4 when a chunk cannot be reopened or
// read, -5 when writing the merged file fails.
int main()
{
	const long long integer_limit = std::numeric_limits<std::streamsize>::max();

	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno != 0 ? errno : -1;

	// Seed once: reseeding per chunk can repeat the same sequence when two
	// chunks start within the same second.
	srand((unsigned int)time(NULL));

	long long cnt_val = 0;   // running record number across all chunks
	for (int f = 0; f < kChunkCount; ++f)
	{
		std::ofstream chunk(chunk_file_name(f).c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno != 0 ? errno : -1;

		std::set<struc3> records;   // keeps the chunk sorted while it is built
		for (long long cnt = 0; cnt < kRecordsPerChunk; ++cnt)
		{
			++cnt_val;
			try
			{
				struc3 val = { 0 };
				fill_random_item(val.item1, val.i1_val);
				val.i1_int = cnt_val;
				records.insert(val);
				if ((cnt + 1) % kProgressStep == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report it and keep going
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		if (!chunk)
		{
			std::cout << "write error " << errno << " in chunk " << f << std::endl;
			return -2;
		}
	}

	// Reopen every chunk and load its first (smallest) record.
	std::ifstream inputfiles[kChunkCount];
	struc3 names[kChunkCount] = { 0 };
	bool eof_reached[kChunkCount] = { false };
	for (int f = 0; f < kChunkCount; ++f)
	{
		inputfiles[f].open(chunk_file_name(f).c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// Pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with.
		int n_min = -1;
		for (int n = 0; n < kChunkCount; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// Write the selected record once, before it is replaced by the next read.
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % kProgressStep == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	std::cout << "the maximum int value is " << integer_limit << std::endl;
	// "nul" is the Windows null device; "null" would create a file of that name.
	system("pause>nul");
	return 0;
}

Open in new window


which makes use of the struct above. In the struct, operator< is used for sorting. Suppose I have also filled item2 and item3 with values in the code above: what do I need to change if I sometimes need to sort the file by item2 or item3 instead?
0
Comment
Question by:HuaMinChen
  • 37
  • 32
  • 3
72 Comments
 
LVL 84

Expert Comment

by:ozo
ID: 40609814
bool operator< (const struc3 & a2) const
     {
          if(strcmp(item1, a2.item1) < 0) return true;
          if(strcmp(item1, a2.item1) > 0) return false;
           if (i1_int < a2.i1_int) return true;
          if (i1_int > a2.i1_int) return false;
          if(strcmp(item2, a2.item2) < 0) return true;
          if(strcmp(item2, a2.item2) > 0) return false;
           if (i2_int < a2.i2_int) return true;
          if (i2_int > a2.i2_int) return false;
          if(strcmp(item3, a2.item3) < 0) return true;
          if(strcmp(item3, a2.item3) > 0) return false;
           if (i3_int < a2.i3_int) return true;
          //if (i3_int > a2.i3_int) return false;
          return false;
}
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40610576
Many thanks Ozo. Suppose the struct has several items. Is there a way to choose just one specific pair each time, such as item1 and i1_int, or item2 and i2_int, to sort by? I mean that once I have selected one specific item, like item1 or item2, the whole file should be sorted by that item.
0
 
LVL 84

Expert Comment

by:ozo
ID: 40611066
How are you making the selection?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40611271
you may use a functor for being able to use different sort criteria on a container:

// Sample record with three sortable members.
struct threeitems 
{
    std::string item1;
    std::string item2;
    int item3;
    threeitems(std::string i1, std::string i2, int i3) : item1(i1), item2(i2), item3(i3) {}
};

// Identifies a member of threeitems; enone/emax are the exclusive bounds of
// the valid range.
enum esort { enone = 0, eitem1 = 1, eitem2=2, eitem3=3, emax };

// Comparison functor with up to three sort keys.
// Each constructor argument selects a member by its esort value; a positive
// value sorts that key ascending, a negative value descending. Keys are taken
// in order until the first argument that is not a valid item.
struct sort_functor
{
    int      nsort;          // number of active keys (0..3)
    esort  sortorder[3];     // active keys, most significant first
    bool   bascending[3];    // direction of each active key

    sort_functor(int firstitem, int seconditem = enone, int thirditem = enone)
        :  nsort(0)
    {
        const int requested[3] = { firstitem, seconditem, thirditem };
        for (int i = 0; i < 3; ++i)
        {
            sortorder[i]  = enone;
            bascending[i] = true;
        }
        for (int i = 0; i < 3; ++i)
        {
            const int key = abs(requested[i]);
            if (key <= enone || key >= emax)
                break;   // an invalid key ends the key list
            sortorder[nsort]  = (esort)key;
            bascending[nsort] = (requested[i] > 0);
            ++nsort;
        }
    }

    // Three-way comparison of one member: <0, 0 or >0.
    static int compare_key(esort key, const threeitems & ti1, const threeitems & ti2)
    {
        switch (key)
        {
        case eitem1:
            return ti1.item1.compare(ti2.item1);
        case eitem2:
            return ti1.item2.compare(ti2.item2);
        case eitem3:
            return (ti1.item3 < ti2.item3) ? -1 : ((ti2.item3 < ti1.item3) ? 1 : 0);
        default:
            return 0;
        }
    }

    // Returns true when ti1 must be ordered before ti2.
    // The first key on which the two records differ decides the result; later
    // keys are consulted only when all earlier keys are equal, which is what
    // makes this a strict weak ordering usable with std::sort and std::set.
    bool operator()(const threeitems & ti1, const threeitems & ti2) const
    {
        for (int i = 0; i < nsort; ++i)
        {
            const int cmp = compare_key(sortorder[i], ti1, ti2);
            if (cmp == 0)
                continue;
            return bascending[i] ? (cmp < 0) : (cmp > 0);
        }
        return false;
    }
};

void mysort()
{
    std::vector<threeitems> tiarr;
    tiarr.push_back(threeitems("abc", "xyz", 123));
    tiarr.push_back(threeitems("aaa", "bbb", 999));
    tiarr.push_back(threeitems("zzz", "yyy", 111));

    std::sort(tiarr.begin(), tiarr.end(), sort_functor(1, -2, -3));
}

Open in new window


the sample sorts the container by first item in ascending order, then (in case of duplicates) by second item in descending order and finally by third item in descending order.

the sort_functor works like a function pointer but has the advantage that you could parameterize it unlike to a function pointer. if you would use sort_functor(3) it would only use item3 in ascending order.

a few things you have to consider if you think on using a functor for the nameval structure:

- you are currently using a std::set and the operator< of struct nameval. a std::set cannot be re-sorted with a different functor afterwards:
   unlike with the std::sort function, the sort criterion is a fixed property of the set and cannot be changed once the set exists.
- if you want to use a functor you should change the container to std::vector.
  then you would push_back the records to the vector unsorted and then sort it by using the functor.
  i would guess, it is slower than by using the std::set and the built-in less operator but i might be wrong.
- in no case you could 'sort a file' by that. you always have to sort in memory and then write a new file.
- because of that you cannot sort the huge file as it is too big to load it to memory.
- but you can sort the smaller containers before you write them to disk.
- of course all files have to be sorted by using the same sort criteria.
- and, the query program also must use the same criteria for the binary search.
- for example if the main search criteria is the number value, your binary search
   would compare numbers and not strings as it does now.

so actually i don't see much benefit in using different sort criterias, beside you want to provide two huge files, where one is sorted by name and the second is sorted by number.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40611575
Many many thanks Sara.
Is there no way to use set to work with functor?

If there is really no way for that, how to adjust these
// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
static const int       kChunkCount      = 80;       // number of sorted chunk files
static const long long kRecordsPerChunk = 1000000;  // records generated per chunk file
static const int       kNameChars       = 20;       // random letters per key (buffers hold 21 bytes)
static const long long kProgressStep    = 10000;    // print a progress line every N records

// Path of chunk file number `index`.
static std::string chunk_file_name(int index)
{
	std::ostringstream filename;
	filename << "c:\\dp4b\\flout" << index << ".bin";
	return filename.str();
}

// Fills `item` with kNameChars random letters (upper or lower case) and
// `text` with "<item> <item> <item>" (62 characters, fits the 100 byte field).
static void fill_random_item(char * item, char * text)
{
	for (int j = 0; j < kNameChars; ++j)
	{
		const char base = (rand() % 2) ? 'A' : 'a';
		item[j] = (char)(base + rand() % 26);
	}
	item[kNameChars] = '\0';
	strcpy(text, item);
	strcat(text, " ");
	strcat(text, item);
	strcat(text, " ");
	strcat(text, item);
}

// Phase 1: generate kChunkCount chunk files, each holding kRecordsPerChunk
//          random records sorted by struc3::operator<.
// Phase 2: merge the chunk files into c:\dp4b\flout_w.bin, writing every
//          record exactly once.
// Returns 0 on success; errno (or -1) when an output file cannot be opened,
// -2 when a chunk cannot be written, -3/-4 when a chunk cannot be reopened or
// read, -5 when writing the merged file fails.
int main()
{
	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno != 0 ? errno : -1;

	// Seed once: reseeding per chunk can repeat the same sequence when two
	// chunks start within the same second.
	srand((unsigned int)time(NULL));

	long long cnt_val = 0;   // running record number across all chunks
	for (int f = 0; f < kChunkCount; ++f)
	{
		std::ofstream chunk(chunk_file_name(f).c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno != 0 ? errno : -1;

		std::set<struc3> records;   // keeps the chunk sorted while it is built
		for (long long cnt = 0; cnt < kRecordsPerChunk; ++cnt)
		{
			++cnt_val;
			try
			{
				struc3 val = { 0 };
				fill_random_item(val.item1, val.i1_val);
				val.i1_int = cnt_val;
				records.insert(val);
				if ((cnt + 1) % kProgressStep == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report it and keep going
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		if (!chunk)
		{
			std::cout << "write error " << errno << " in chunk " << f << std::endl;
			return -2;
		}
	}

	// Reopen every chunk and load its first (smallest) record.
	std::ifstream inputfiles[kChunkCount];
	struc3 names[kChunkCount] = { 0 };
	bool eof_reached[kChunkCount] = { false };
	for (int f = 0; f < kChunkCount; ++f)
	{
		inputfiles[f].open(chunk_file_name(f).c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// Pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with.
		int n_min = -1;
		for (int n = 0; n < kChunkCount; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// Write the selected record once, before it is replaced by the next read.
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % kProgressStep == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	return 0;
}

Open in new window


to use vector instead?

Here is the struct file.
// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-layout record that is written to / read from the binary files as raw
// bytes, so the member order and sizes must not change.
// Requires <string.h> and <stdlib.h> to be included before this header.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     // Length of item1 in bytes, never more than sizeof(item1).
     // memchr keeps the scan inside the array even if item1 lacks a terminator.
     int  get_len() const
     {
         const void * terminator = memchr(item1, '\0', sizeof(item1));
         if (terminator == 0)
             return (int)sizeof(item1);
         return (int)((const char *)terminator - item1);
     }

     // Converts item1 to wide characters into nm_uni, which has room for
     // sizfld wide characters. The result is null-terminated whenever the
     // terminator fits; a name that fills the whole buffer is truncated
     // without terminator. On an invalid multibyte sequence nm_uni becomes
     // an empty string. Nothing is written when sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // Convert from a terminated copy so mbstowcs can never read past item1.
         char narrow[sizeof(item1) + 1];
         const int len = get_len();
         memcpy(narrow, item1, len);
         narrow[len] = '\0';
         const size_t converted = mbstowcs(nm_uni, narrow, (size_t)sizfld);
         if (converted == (size_t)-1)
             nm_uni[0] = L'\0';
     }

     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // strncmp bounds the comparison to the array size.
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0) return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window

0
 
LVL 32

Expert Comment

by:sarabande
ID: 40611908
Is there no way to use set to work with functor?

actually, std::set has a second template argument 'Traits', which is a functor type and defaults to std::less<key_type> (which uses the operator< of the key class). the catch is that a set defined without constructor arguments default-constructs that functor, so the functor class must have a default constructor - and a default-constructed functor gives you no chance to 'configure' the sorting. (alternatively, a configured functor object can be passed to the set's constructor, e.g. std::set<threeitems, sort_functor> tiset(sort_functor(1, -2)); the ordering is then still fixed for the lifetime of that set.)

the following code

   
// Intentionally failing example: defining the set without a constructor
// argument requires sort_functor to be default-constructible, but its only
// constructor needs at least the first sort item.
std::set<threeitems, sort_functor> tiset;
    tiset.insert(threeitems("abc", "xyz", 123));

Open in new window


would give error C2512: 'sort_functor::sort_functor' : no appropriate default constructor available' because of that.

I found a way out by the following:

   
// Comparator with a fixed configuration (item1 ascending); it exists only to
// give sort_functor the default constructor std::set asks for here.
struct sort_functor_item1_asc : sort_functor
    {
        sort_functor_item1_asc()
            : sort_functor(eitem1)
        {
        }
    };

    // a set that keeps its elements ordered by item1
    std::set<threeitems, sort_functor_item1_asc> tiset1;
    tiset1.insert(threeitems("abc", "xyz", 123));

Open in new window


where you derive from the functor and so create the required default constructor.


how to adjust these to use vector instead?

you have to do 3 things:

- define 'std::vector<nameval> records' instead of 'std::set<nameval> records'.
- use push_back instead of insert
- add std::sort(records.begin(), records.end()) after inserting 1 million of records to the vector.

the last would sort the records by using operator< of struct nameval.

if you want to use a functor you may modify the sort_functor struct I posted and adjust it such that it fits to the nameval struct.

note, technically you have all possibilities with little efforts. but you need to consider what you want to achieve by making the sort criteria dynamically. as told the sort order is crucial for your second program which reads the records. if you make a change to the savebinaryfile you have to make the same change to readbinaryfile. also sorting for the number value makes not so much sense as the number is continuous anyway. if you want to sort your files by number value you don't need any sorting at all but simply could write the records sequentially to the file - what also could be the huge file as you also don't need a merge. if you would simply take your current working programs and write each record you created to a further binary output file (directly when inserting it to the std::set), then your query program could operate on two huge files of same size, one is sorted by name and one is sorted by number. if you search by number you could use the number as a record number and read directly from file ordered by number. and if the user searched by name you could use the binary search. for this there are no functors needed and your efforts are minimal.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40611911
Many thanks Sara.
I
if you want to sort your files by number value you don't need any sorting at all but simply could write the records sequentially to the file - what also could be the huge file as you also don't need a merge. if you would simply take your current working programs and write each record you created to a further binary output file (directly when inserting it to the std::set), then your query program could operate on two huge files of same size, one is sorted by name and one is sorted by number. if you search by number you could use the number as a record number and read directly from file ordered by number. and if the user searched by name you could use the binary search. for this there are no functors needed and your efforts are minimal.

Does it mean, if I have several items to sort, I then need to create several huge files, to sort, depending on that I still use set to the programs?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40611913
And the point is that it is better to use a set rather than a vector, as the vector would be slower, right?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40611992
Does it mean, if I have several items to sort, I then need to create several huge files, to sort, depending on that I still use set to the programs?
yes. it is the same as in a database where you were using multiple index columns or combinations of index columns.

the only difference: while in a database you have two storages one for the data and one for the index (keys) you currently were using the sorted records themselves for index search. that makes sense as your records are mainly keys and therefore it is not necessary to have a separate data storage where the keys are referring to.

however, if you intend to make the number value a second index, things are different. alternatively to creating a second huge file now sorted by number, you could store records with two members such you got a mapping of the number value to the record number in the huge file where the other members are stored.

flout.bin:

[AAAABXC...., 16666666]
[AAAAFGA...,  55111511] 
....
[zzzzsabbc..., 47001234]

flout.idx:

[1, 8868686]
...
[16666666, 0]         -> points to record 0 of flout.bin
...
[47001234, 79999999] -> points to the last of 80 million records
...
[55111511, 1]    -> points to 2nd record of flout.bin

Open in new window


if your next structure has more than 2 members it would make sense to using the above design for all your indexes. that means you would not sort the records at all but simply write the records sequentially to a (huge) data file which is now your database. additionally you would create index files for each index you want to support. for example if you want to have an own index for each member, you would create an index file for each of them. index file creation would be done like your current savebinaryfile works. because of the huge amount of keys, it is not possible to hold 80 million (or more) keys in memory. the way out is, you know, to create smaller chunks of - say 1 million - index files and merge them at end of program to one huge index file. you would do that for each index directly while writing to the data file. the readbinayfile then would have to open 4 files, the data file and the 3 index files. depending on the search request it would decide which index file it has to use for the binary search. as the index file contains the index key, the binary search either would find the key or not. in the first case it would read from data file by using the record number that was stored with the found key in the index file.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40613413
Sara,
I appreciate you a lot.

To have different index file due to one specific order selected, does it mean I should have more than one struct file for that? If yes, what to adjust to this

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-layout record that is written to / read from the binary files as raw
// bytes, so the member order and sizes must not change.
// Requires <string.h> and <stdlib.h> to be included before this header.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item1 in bytes, never more than sizeof(item1).
     // memchr keeps the scan inside the array even if item1 lacks a terminator.
     int  get_len() const
     {
         const void * terminator = memchr(item1, '\0', sizeof(item1));
         if (terminator == 0)
             return (int)sizeof(item1);
         return (int)((const char *)terminator - item1);
     }

     // Converts item1 to wide characters into nm_uni, which has room for
     // sizfld wide characters. The result is null-terminated whenever the
     // terminator fits; a name that fills the whole buffer is truncated
     // without terminator. On an invalid multibyte sequence nm_uni becomes
     // an empty string. Nothing is written when sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // Convert from a terminated copy so mbstowcs can never read past item1.
         char narrow[sizeof(item1) + 1];
         const int len = get_len();
         memcpy(narrow, item1, len);
         narrow[len] = '\0';
         const size_t converted = mbstowcs(nm_uni, narrow, (size_t)sizfld);
         if (converted == (size_t)-1)
             nm_uni[0] = L'\0';
     }

     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // strncmp bounds the comparison to the array size.
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0) return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window


inside which, I have several items.

And how to adjust the following altogether?
// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
static const int       kChunkCount      = 80;       // number of sorted chunk files
static const long long kRecordsPerChunk = 1000000;  // records generated per chunk file
static const int       kNameChars       = 20;       // random letters per key (buffers hold 21 bytes)
static const long long kProgressStep    = 10000;    // print a progress line every N records

// Path of chunk file number `index`.
static std::string chunk_file_name(int index)
{
	std::ostringstream filename;
	filename << "c:\\dp4b\\flout" << index << ".bin";
	return filename.str();
}

// Fills `item` with kNameChars random letters (upper or lower case) and
// `text` with "<item> <item> <item>" (62 characters, fits the 100 byte field).
static void fill_random_item(char * item, char * text)
{
	for (int j = 0; j < kNameChars; ++j)
	{
		const char base = (rand() % 2) ? 'A' : 'a';
		item[j] = (char)(base + rand() % 26);
	}
	item[kNameChars] = '\0';
	strcpy(text, item);
	strcat(text, " ");
	strcat(text, item);
	strcat(text, " ");
	strcat(text, item);
}

// Phase 1: generate kChunkCount chunk files, each holding kRecordsPerChunk
//          random records (all four items filled) sorted by struc3::operator<.
// Phase 2: merge the chunk files into c:\dp4b\flout_w.bin, writing every
//          record exactly once.
// Returns 0 on success; errno (or -1) when an output file cannot be opened,
// -2 when a chunk cannot be written, -3/-4 when a chunk cannot be reopened or
// read, -5 when writing the merged file fails.
int main()
{
	const long long integer_limit = std::numeric_limits<std::streamsize>::max();

	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno != 0 ? errno : -1;

	// Seed once: reseeding per chunk can repeat the same sequence when two
	// chunks start within the same second.
	srand((unsigned int)time(NULL));

	long long cnt_val = 0;   // running record number across all chunks
	for (int f = 0; f < kChunkCount; ++f)
	{
		std::ofstream chunk(chunk_file_name(f).c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno != 0 ? errno : -1;

		std::set<struc3> records;   // keeps the chunk sorted while it is built
		for (long long cnt = 0; cnt < kRecordsPerChunk; ++cnt)
		{
			++cnt_val;
			try
			{
				struc3 val = { 0 };
				fill_random_item(val.item1, val.i1_val);
				val.i1_int = cnt_val;
				fill_random_item(val.item2, val.i2_val);
				val.i2_int = cnt_val;
				fill_random_item(val.item3, val.i3_val);
				val.i3_int = cnt_val;
				fill_random_item(val.item4, val.i4_val);
				val.i4_int = cnt_val;

				records.insert(val);
				if ((cnt + 1) % kProgressStep == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report it and keep going
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		if (!chunk)
		{
			std::cout << "write error " << errno << " in chunk " << f << std::endl;
			return -2;
		}
	}

	// Reopen every chunk and load its first (smallest) record.
	std::ifstream inputfiles[kChunkCount];
	struc3 names[kChunkCount] = { 0 };
	bool eof_reached[kChunkCount] = { false };
	for (int f = 0; f < kChunkCount; ++f)
	{
		inputfiles[f].open(chunk_file_name(f).c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// Pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with.
		int n_min = -1;
		for (int n = 0; n < kChunkCount; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// Write the selected record once, before it is replaced by the next read.
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % kProgressStep == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	std::cout << "the maximum int value is " << integer_limit << std::endl;
	return 0;
}

Open in new window

0
 
LVL 84

Expert Comment

by:ozo
ID: 40613515
Maybe you can have subclasses that all inherit from the parent struct.
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40613523
Can I have more details to the way? Thanks.
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40613714
you need a struct or class for each index.

// One record of the name index: a key plus the number of the data-file
// record it refers to.
struct index_name 
{
     char item1[21];
     unsigned int recnum;
};

// One record of the number index.
struct index_number 
{
     long long number;
     unsigned int recnum;
};


// One record of the description index.
struct index_desc
{
      char description[100];
      unsigned int recnum;
};

Open in new window


instead of 3 structures (classes)you also could use one template class instead.

// Generic index record: a key of type Key stored next to a record number.
template <typename Key>
struct Index
{
     Key t;                  // the key value
     unsigned int recnum;    // record number stored with the key
};

// key types for the fixed-size text members
typedef char Name[21];
typedef char Desc[100];

// one index record object per key type
Index<Name>      name_index;
Index<long long> number_index;
Index<Desc>      desc_index;

Open in new window


you also could use std::pair for that purpose or use a base class for all index structures as ozo has suggested.

// Common base of all index records: holds the record number.
struct Index
{
      unsigned int recnum;
};

// Index record for the name key; recnum is inherited from Index.
struct Name_Index : Index
{
      char name[21];
};
// and so on

Open in new window


all these alternatives are equivalent for your purposes. there are a few technical advantages/disadvantages for any of the solutions. I would suggest to start with 3 structures in order to keep it simple. you easily could switch to a better class design if things were working. the method to create the index files could be a copy of  the current mechanism based on the index structures where you create smaller files and merge them to one big file. you would do that for each index separately.

however, before you start with new index structures you should make clear what you want to achieve. actually, I can't see any sense for creating an index for a 64-bit integer value which has the same value as the record number it points to.  obviously such an index is redundant and of no value. also the third member, a 100 byte string, rarely is a good candidate for an index file with fix-sized record length. 100 characters is a maximum size and most strings would have much less text such that your index file would contain mostly zeros what is a bad design. moreover, for an index you would need unique texts what rarely could guaranteed for such texts. in the database world index files were based on b-trees which allow different lengths of the keys and also could handle duplicate or empty keys.

to sum up, if your original structure would contain a 20 char unique name key as first member, an integer which is a sequential number from 1 to 80 million, and a "description" which is any free text of variable length up to 99 characters and could be empty or could contain duplicates, then you should create a data file which is sorted by the description, and have two index files, one for the names and one for the number (which could be an 'int' and not a 'long long'). the binary search for the description could be done directly at the data file as you do now.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40613755
Many thanks Sara.
Is there an example that shows how one specific item of the original struct relates to one of the indexes in your code above?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40613902
my last comment would relate to structures

// Data record; ordered by desc, then name, then number.
struct nameval
{
     char name[21];
     int    number;
     char desc[100];
     bool operator< (const nameval & nv) const
     {
           // the first member that differs decides the order
           const int by_desc = strcmp(desc, nv.desc);
           if (by_desc != 0) return by_desc < 0;
           const int by_name = strcmp(name, nv.name);
           if (by_name != 0) return by_name < 0;
           return number < nv.number;
     }
};

// Index entry for the name key: the name plus the number of the
// data-file record it refers to.
struct name_index
{
     char name[21];
     unsigned int recnum;

     // Orders by name; recnum only breaks ties between equal names,
     // so it never decides the order when every name is unique.
     bool operator< (const name_index & ni) const
     {
           const int by_name = strcmp(name, ni.name);
           if (by_name != 0)
                 return by_name < 0;
           return recnum < ni.recnum;
     }
};

// Index entry for the number key: the number plus the number of the
// data-file record it refers to.
struct number_index
{
     int number;
     unsigned int recnum;

     // Orders by number; recnum only breaks ties between equal numbers,
     // so it never decides the order when every number is unique.
     bool operator< (const number_index & ni) const
     {
           if (number != ni.number)
                 return number < ni.number;
           return recnum < ni.recnum;
     }
};

Open in new window


the first structure contains all three items and will be sorted by the desc + name + number. because of that the data file could be used for a binary search on the 3rd item 'desc'. you could use a std::vector as container for the records since the "desc" is not an unique key. after 1 million records you would sort the vector by using the operator<. you also could use a std::set because the operator< uses name and number to make each record unique. it might be interesting to find out which container is faster. for any case write the records (of struct nameval) to a file. finally all data files will be merged to one huge file.

the other two structures describe the records for the name index and the number index. both should have unique keys. you can't build the index files for name and number parallel to creating the data records because the final record number will not be known before merge of the data file. so the index files must be created after merge by reading the data file sequentially and inserting both name_index and number_index to containers (either std::set or std::vector). after 1 million entries you would store the sorted containers to a file and clear the containers for the next chunk. finally the index files need to be merged into big files.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40615816
Many thanks Sara.

I think only writing the big file, I should have several file copies, due to different indexes, right?

How to refer to different struct, due to different items below

     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];
     ...

Open in new window

since in the original .h file, I now have several items shown in above?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40615818
Correction: I think only when writing the big file, I should have several file copies, due to different indexes, right?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40616199
no, if you follow the design I described in my last post, you would have 3 "big" files:

1 data file sorted  by combination description+name+number. its packed record size is 125 (unpacked 128) bytes.
1 index file sorted by name. its record size is 24 (20 for name + 4 for record number)
1 index file sorted by number. its record size is 8 (4 for number and 4 for record number).

though the last file is 16 times smaller than the data file, it is still too big to be fully stored in memory. if using a vector you would not get the contiguous memory storage needed. and if using a set, the number of pointers used would lead to severe swapping such that your program would take hours if not days to create the file.

because of that you need to create smaller files first, which can be sorted in memory, and then merge them into one big file, which doesn't need much memory.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40618009
Many thanks Sara.

My current struct is having 4 items inside and I did create small-size files and big file to include all 4 items. I know we can have specific struct for the expected order we want to have, to one specific item? or i?_int. I think I have to create different big file for different index I expect to have. How to adjust the following, to create the big file, per new struct of the expected item?/i?_int ordering?


	// Merge of the 80 input files into file_w.
	// names[n] holds the current, not yet written record of input file n;
	// the inputs are expected to be sorted by item1 already.
	long long write_counter = 0;
	while (true)
	{
		// pick the still-open input whose current record has the smallest item1
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 80; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n].item1 < name_min)
			{
				name_min = names[n].item1;
				n_min = n;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done

		// write the selected record exactly once, and do it before the next
		// read overwrites names[n_min]
		if (!file_w.write((char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			// progress line for the record that was just written
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		// advance the input the record came from; on end of file take it out of the merge
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	...

Open in new window

0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40618037
Here is one sample small-size file, that is having all 4 items inside.
https://dl.dropboxusercontent.com/u/40211031/flout0.zip
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40618300
Per my understanding to have different structs as you've advised, we need to have record number to all struct, while within my original struct, the files would be sorted by record number, and each other different big file, which is ordered by one item? or i?_val, would be further searched per given item values, either item? or i?_val, and we would finally use the captured record number, to search against the original big file, to catch only one record. Is this correct?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40618403
Here is one sample small-size file, that is having all 4 items inside
I can't open a binary zip file until tomorrow. I don't actually know what you mean by 4 'items'. do you mean 4 records? or do you mean 4 members?

to have different structs as you've advised, we need to have record number to all struct
what do you mean by that?


my suggestion uses an original structure with 3 members: a name of 21 characters, a 32-bit number, a description of 100 characters. so the main data file has a record size of 125 bytes (or 128 bytes if aligned to 32-bit boundary). it doesn't need a member for record number because the record number is known when reading the structure from file. if the file is sorted by the 100-character text value (+ name as second sort criteria if texts are not unique) you could do a binary search for the text with the method you now used in readbinaryfile. if the texts are not unique such a search may have more than one match. the binary search would find one of a group of identical texts and you would have to search from the record position in both directions to get the whole result set. but, since the file is sorted by text, all records with same text definitively build a contiguous block within the data file.

for the index files, you would have a different design. an index record structure is a pair of key+recordnumber. the record numbers point to the data file.

here a sample with 5 records:

data file 5 records. it is ordered by 3rd column text.

                name  number   text
whatever    33            Axy zjk...
anykey        100          Dabc G..
some           29            Gxxx ku...
name           531         Maaa bJ...
other           41            Szavh...

Open in new window


name index file has also 5 records. it contains name column as key and points to record in data file. it is sorted by name and you can do a binary search if you lookup for a key. for example if you look for 'some' you would first check the middle key 'other' and find out that 'some' is greater 'other'. because of that you read the middle of the second half and read 'some' which is the match.

                name  recnum
anykey         2
name           4
other           5 
some           3
whatever    1

Open in new window


number index file has also 5 records. it contains number column as key and points to record in data file. it is sorted by number and you can do a binary search if you lookup for a given number. for example if you look for 35 you would first check the middle key 41 and find out that 35 is less than 41. because of that you read the middle of the first half and read 33 which is less than 35. the next 'middle' is 33 again, what means that 35 cannot be found.

            number  recnum
29                  3
33                  1
41                  5
100                2
531                4

Open in new window


to sum up, the original file is sorted by 3rd member. hence, you don't need an extra index file for this column. the two other columns will be stored separately in index files and point to the record numbers in the data file. you can do a binary search on those keys but have to read into the data file for a hit to get all data for the entry.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40618418
Many many thanks Sara.

Within my previous struct header file, I have

     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];
     ...

Open in new window

We actually use the big file to do search, right? And I have different items shown in above, and I expect to sort on one specific item only, upon the given need. It means we have to create different "big" file sorted on given item or item "long" integer (with name like i?_val), right? How to create the big file, being sorted on one specific item name or item integer?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40618570
can you answer my question what you mean by item1, item2, item3, item4, .... ?

why do you have a struct which has multiple triples of members? why not using an array? and what has the new structure to do with the current namval, savebinaryfile, readbinaryfile sources?


How to create the big file, being sorted on one specific item name or item integer?

to build the three files you already have code for all the functionality needed.

data file:
use structure nameval as posted by me. it already has a valid operator< defined such that it would sort by 3rd text member.
- create 80 files with 1 million entries each by using a std::set to get the entries sorted per file(std::vector probably would need too many contiguous memory).
- merge all 80 files to one
- reopen the huge file and read it sequentially
- that way you can count the record number from 1 to 80 million
- use structure name_index and number_index as posted by me. both already have a valid operator< defined.
- use two std::set containers, one for each structure.
- for each index create 80 files with 1 million entries each by using the std::set to get the entries sorted by name respectively by number. as the structures for index are small you could create both the index files parallel.
- for each index merge all 80 files to one.
- for name index you have to use an array of 80 names to find the current minimum.
- for number index use an array of 80 integers.

finally you would have 3 big files where each of them could be used for a binary search. if you have a hit when using one of the index files you finally have to read the data file at record number which you got from the found index entry.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40618696
Many thanks Sara.

I need to have about 4 names and 4 numbers, to the original struct due to different purpose, to each record, and I expect to be able to search by one of them. This is why I showed you the struct having 4 items. Does it mean I should have 6 extra structs, like what you showed by name_index, and number_index?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40618961
I need to have about 4 names and 4 numbers, to the original struct due to different purpose
ok. are the 4 names and 4 numbers unique? I mean do you have 320 million different names and 320 million different numbers for the 80 million records? or will you have empty names or zero numbers? what about the 100 char texts? do you have 4 such texts in a record or only one? if you have 4 names, 4 numbers, and 4 texts why not using 4 records instead? what is the reason that you want to put them into one record?

note, if you would create a huge file out of your structure, you also could read records from the file which contain only one item. that means if you make index files for names and numbers, the entries in those index files also could point to record numbers of records which contain only 1 name, number, and text.

if you would have (up to) 4 names, (up to) 4 numbers, and only 1 text, you should use the concept I described above with a little change. you would have a data file sorted by text what makes sense as there is only 1 text per record. hence you later can do a quick search on any text (or text begin) on the data file itself. then you would create only 1 index file for the names and 1 index file for the numbers. these files would get not 80 million entries but up to 320 million entries granted that you always have 4 names and 4 numbers per data record. but you easily could handle a dynamic number of names and numbers per data record as well since you build the index files by reading from the data records. so you are free to add 1, 2, 3 or 4 names to the name index, and 1, 2, 3, or 4 numbers to the number index. so some records of the data file would be referenced up to 8 times from the index files and others maybe only two times. note, the merge mechanism will also work if the files you were merging have a different number of entries. so regardless how many index entries you have collected in a set, you always could write it to a file and open a new file and clear the old set.

if you have 4 names, 4 numbers, and (up to) 4 texts you would not sort the data files but write them sequentially without using a set to the huge file. that means you also don't need to merge the data file. additionally, you can create the index files directly while writing the data file because the record number you need to store in the index entries is simply the loop counter from 1 to 80 million. note, you couldn't search for texts if you do so, beside you create an index file for the texts as well. if this could be a requirement you may think of creating a 3rd index file for texts where you don't store all text but for example non-trivial nouns or words. same as for name and number you may use an arbitrary count of strings from texts per data record. however, you should limit the "word length" for example to 20 characters such that the index file would not exceed the size of the data file.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40620476
Many thanks Sara.

OK, now 1st big file is ordered by item1 and i1_int, like before. And I want to create one other big file sorting on item2 and i2_int (this file will provide the way to search by item2 instead), and I have these codes
// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Record with four (name, number, text) triples, keyed by item1 + i1_int.
// Records are written to and read from file as raw bytes of sizeof(struc3),
// so the order and size of the data members must stay as they are.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item1 in bytes, never more than sizeof(item1).
     // memchr stops at the end of the array, so a completely filled field
     // (no terminating zero) cannot cause a read past item1.
     int  get_len() const
     {
         const void * nul = memchr(item1, 0, sizeof(item1));
         if (!nul)
             return static_cast<int>(sizeof(item1));
         return static_cast<int>(static_cast<const char *>(nul) - item1);
     }

     // Converts item1 to wide characters and stores it in nm_uni.
     // nm_uni : output buffer with room for sizfld wide characters
     // sizfld : capacity of nm_uni, including the terminating zero
     // The result is always zero-terminated; at most sizfld - 1 characters
     // are converted. Nothing is written if nm_uni is null or sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (!nm_uni || sizfld <= 0)
             return;

         // mbstowcs needs a zero-terminated source, item1 may lack the terminator
         char src[sizeof(item1) + 1];
         const int len = get_len();
         memcpy(src, item1, len);
         src[len] = '\0';

         size_t converted = mbstowcs(nm_uni, src, sizfld - 1);
         if (converted == static_cast<size_t>(-1))
             converted = 0;     // invalid multibyte sequence: return an empty string
         // mbstowcs writes no terminator when it stops at the length limit
         nm_uni[converted] = L'\0';
     }

     // Orders by item1, then by i1_int. The name comparison is bounded by the
     // field size so that an unterminated item1 is still compared safely.
     bool operator< (const struc3 & a2) const
     {
           const int by_name = strncmp(item1, a2.item1, sizeof(item1));
           if (by_name != 0)
                 return by_name < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

// struc3c.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Record with four (name, number, text) triples, keyed by item2 + i2_int.
// The data members match those of struc3; only the key used by the
// member functions differs.
struct struc3c
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item2 in bytes, never more than sizeof(item2).
     // memchr stops at the end of the array, so a completely filled field
     // (no terminating zero) cannot cause a read past item2.
     int  get_len() const
     {
         const void * nul = memchr(item2, 0, sizeof(item2));
         if (!nul)
             return static_cast<int>(sizeof(item2));
         return static_cast<int>(static_cast<const char *>(nul) - item2);
     }

     // Converts item2 to wide characters and stores it in nm_uni.
     // nm_uni : output buffer with room for sizfld wide characters
     // sizfld : capacity of nm_uni, including the terminating zero
     // The result is always zero-terminated; at most sizfld - 1 characters
     // are converted. Nothing is written if nm_uni is null or sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (!nm_uni || sizfld <= 0)
             return;

         // mbstowcs needs a zero-terminated source, item2 may lack the terminator
         char src[sizeof(item2) + 1];
         const int len = get_len();
         memcpy(src, item2, len);
         src[len] = '\0';

         size_t converted = mbstowcs(nm_uni, src, sizfld - 1);
         if (converted == static_cast<size_t>(-1))
             converted = 0;     // invalid multibyte sequence: return an empty string
         // mbstowcs writes no terminator when it stops at the length limit
         nm_uni[converted] = L'\0';
     }

     // Orders by item2, then by i2_int. The name comparison is bounded by the
     // field size so that an unterminated item2 is still compared safely.
     bool operator< (const struc3c & a2) const
     {
           const int by_name = strncmp(item2, a2.item2, sizeof(item2));
           if (by_name != 0)
                 return by_name < 0;
           return i2_int < a2.i2_int;
     }
};

#endif

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
//#include "..\..\include\struc3b.h"   
#include "..\..\include\struc3c.h"   
//#include "..\..\include\struc3d.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
int main()
{
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long cnt_val = 0;
	std::ofstream files[11];
	std::ofstream file_w;
	std::set<struc3> records_w;
	std::ostringstream filename_w;
	filename_w << "c:\\dp4b\\flout_w.bin";
	std::string strfilename_w = filename_w.str();
	file_w.open(strfilename_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	std::ofstream file_w3;
	std::set<struc3c> records_w3;
	std::ostringstream filename_w3;
	filename_w3 << "c:\\dp4b\\flout_w3.bin";
	std::string strfilename_w3 = filename_w3.str();
	file_w3.open(strfilename_w3.c_str(), std::ios::binary | std::ios::out);
	if (!file_w3.is_open()) return errno;
	...

Open in new window

how to correct these?
Error	4	error C2065: 'struc3c' : undeclared identifier	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	5	error C2923: 'std::set' : 'struc3c' is not a valid template type argument for parameter '_Kty'	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	6	error C2133: 'records_w3' : unknown size	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	7	error C2512: 'std::set' : no appropriate default constructor available	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile

Open in new window

0
 
LVL 32

Expert Comment

by:sarabande
ID: 40620642
#ifndef NAME_VAL_H
because of that preprocessor statement the struc3c struct was not recognized by the compiler.

each headerfile needs its own macro to protect from being included twice. you better would derive the macro from file name to avoid such issues.

//struc3c.h
#ifndef STRUC3C_H
...
#endif
// - eof -

Open in new window


Sara
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40620662
And I want to create one other big file sorting on item2 and i2_int
i wonder how long it takes to creating such huge files where three-fourths of each record is only wasted space.

your current record size is 516 bytes and you could go with 128 instead, still having 4 names and 4 numbers plus text.

one of your files is 40 gb in size. so if you want to support all 4 names as keys and all 4 numbers you need 320 gb.

if using one big file with record size 128 the file size is 10 gb. two index files with 320 million entries each would take 8 gb for name index and 2.5 gb for number index.

you have all the instruments for to build those files and contrary to your current approach it is less complex and doesn't require to exchange header files and get issues because of redundant code.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40620856
one of your files is 40 gb in size. so if you want to support all 4 names as keys and all 4 numbers you need 320 gb.

if using one big file with record size 128 the file size is 10 gb. two index files with 320 million entries each would take 8 gb for name index and 2.5 gb for number index.

you have all the instruments for to build those files and contrary to your current approach it is less complex and doesn't require to exchange header files and get issues because of redundant code.

Many thanks Sara.
What other ways should be adopted, save the space wasted?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40621024
Correction:

What other ways should be adopted, to save the space wasted?

And I don't know why the 2nd big file cannot be created, using these

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Record with four (name, number, text) triples, keyed by item1 + i1_int.
// Records are written to and read from file as raw bytes of sizeof(struc3),
// so the order and size of the data members must stay as they are.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item1 in bytes, never more than sizeof(item1).
     // memchr stops at the end of the array, so a completely filled field
     // (no terminating zero) cannot cause a read past item1.
     int  get_len() const
     {
         const void * nul = memchr(item1, 0, sizeof(item1));
         if (!nul)
             return static_cast<int>(sizeof(item1));
         return static_cast<int>(static_cast<const char *>(nul) - item1);
     }

     // Converts item1 to wide characters and stores it in nm_uni.
     // nm_uni : output buffer with room for sizfld wide characters
     // sizfld : capacity of nm_uni, including the terminating zero
     // The result is always zero-terminated; at most sizfld - 1 characters
     // are converted. Nothing is written if nm_uni is null or sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (!nm_uni || sizfld <= 0)
             return;

         // mbstowcs needs a zero-terminated source, item1 may lack the terminator
         char src[sizeof(item1) + 1];
         const int len = get_len();
         memcpy(src, item1, len);
         src[len] = '\0';

         size_t converted = mbstowcs(nm_uni, src, sizfld - 1);
         if (converted == static_cast<size_t>(-1))
             converted = 0;     // invalid multibyte sequence: return an empty string
         // mbstowcs writes no terminator when it stops at the length limit
         nm_uni[converted] = L'\0';
     }

     // Orders by item1, then by i1_int. The name comparison is bounded by the
     // field size so that an unterminated item1 is still compared safely.
     bool operator< (const struc3 & a2) const
     {
           const int by_name = strncmp(item1, a2.item1, sizeof(item1));
           if (by_name != 0)
                 return by_name < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

// struc3c.h
#ifndef STRUC3C_H
#define STRUC3C_H

// Record with four (name, number, text) triples, keyed by item2 + i2_int.
// The data members match those of struc3; only the key used by the
// member functions differs.
struct struc3c
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item2 in bytes, never more than sizeof(item2).
     // memchr stops at the end of the array, so a completely filled field
     // (no terminating zero) cannot cause a read past item2.
     int  get_len() const
     {
         const void * nul = memchr(item2, 0, sizeof(item2));
         if (!nul)
             return static_cast<int>(sizeof(item2));
         return static_cast<int>(static_cast<const char *>(nul) - item2);
     }

     // Converts item2 to wide characters and stores it in nm_uni.
     // nm_uni : output buffer with room for sizfld wide characters
     // sizfld : capacity of nm_uni, including the terminating zero
     // The result is always zero-terminated; at most sizfld - 1 characters
     // are converted. Nothing is written if nm_uni is null or sizfld <= 0.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (!nm_uni || sizfld <= 0)
             return;

         // mbstowcs needs a zero-terminated source, item2 may lack the terminator
         char src[sizeof(item2) + 1];
         const int len = get_len();
         memcpy(src, item2, len);
         src[len] = '\0';

         size_t converted = mbstowcs(nm_uni, src, sizfld - 1);
         if (converted == static_cast<size_t>(-1))
             converted = 0;     // invalid multibyte sequence: return an empty string
         // mbstowcs writes no terminator when it stops at the length limit
         nm_uni[converted] = L'\0';
     }

     // Orders by item2, then by i2_int. The name comparison is bounded by the
     // field size so that an unterminated item2 is still compared safely.
     bool operator< (const struc3c & a2) const
     {
           const int by_name = strncmp(item2, a2.item2, sizeof(item2));
           if (by_name != 0)
                 return by_name < 0;
           return i2_int < a2.i2_int;
     }
};

#endif

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
//#include "..\..\include\struc3b.h"   
#include "..\..\include\struc3c.h"   
//#include "..\..\include\struc3d.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
// Test-data generator and merger.
//   1. opens the two output files flout_w.bin and flout_w3.bin
//   2. creates 7 chunk files flout0.bin .. flout6.bin with 1000000 random
//      records each; a std::set<struc3> sorts every chunk by struc3::operator<
//   3. merges the 7 chunk files into flout_w.bin, selecting by item1
//   4. runs a second merge loop, selecting by item2, into flout_w3.bin
// Returns 0 on success, errno if an output file cannot be opened, -3 if a
// chunk file cannot be reopened, -4 if its first record cannot be read,
// -5 on a failed write to a merged file.
// NOTE(review): steps 3 and 4 contain defects, see the notes at the loops.
int main()
{
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	// NOTE(review): temp, records_w and records_w3 are declared but never used in this function
	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long cnt_val = 0;	// running record number over all chunks, stored in every i?_int
	std::ofstream files[7];
	std::ofstream file_w;
	std::set<struc3> records_w;
	std::ostringstream filename_w;
	filename_w << "c:\\dp4b\\flout_w.bin";
	std::string strfilename_w = filename_w.str();
	file_w.open(strfilename_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	std::ofstream file_w3;
	std::set<struc3c> records_w3;
	std::ostringstream filename_w3;
	filename_w3 << "c:\\dp4b\\flout_w3.bin";
	std::string strfilename_w3 = filename_w3.str();
	file_w3.open(strfilename_w3.c_str(), std::ios::binary | std::ios::out);
	if (!file_w3.is_open()) return errno;

	// ---- step 2: create the sorted chunk files ----
	for (int f = 0; f < 7; ++f)
	{
		std::set<struc3> records;
		// NOTE(review): the generator is reseeded for every chunk; chunks that
		// start within the same second would get the same seed
		srand((int)time(NULL));
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		std::string strfilename = filename.str();
		files[f].open(strfilename.c_str(), std::ios::binary | std::ios::out);
		if (!files[f].is_open()) return errno;
		for (cnt = 0; cnt<1000000; cnt++)
		{
			cnt_val++;
			try
			{
				// all members start as zero, so the += below acts as an assignment
				// and byte 20 of every item stays the terminating zero
				struc3 val = { 0 };
				int j;
				// item1: 20 random letters, each upper or lower case
				for (j = 0; j<20; j++)
				{
					val.item1[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i1_int = cnt_val;
				// i1_val: the name three times, separated by blanks (62 characters, fits into 100)
				strcpy(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);

				// item2 / i2_int / i2_val: same scheme
				for (j = 0; j<20; j++)
				{
					val.item2[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i2_int = cnt_val;
				strcpy(val.i2_val, val.item2);
				strcat(val.i2_val, " ");
				strcat(val.i2_val, val.item2);
				strcat(val.i2_val, " ");
				strcat(val.i2_val, val.item2);

				// item3 / i3_int / i3_val: same scheme
				for (j = 0; j<20; j++)
				{
					val.item3[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i3_int = cnt_val;
				strcpy(val.i3_val, val.item3);
				strcat(val.i3_val, " ");
				strcat(val.i3_val, val.item3);
				strcat(val.i3_val, " ");
				strcat(val.i3_val, val.item3);

				// item4 / i4_int / i4_val: same scheme plus a trailing " ????"
				for (j = 0; j<20; j++)
				{
					val.item4[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i4_int = cnt_val;
				strcpy(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, "????");

				// the set keeps the chunk sorted by struc3::operator< (item1, then i1_int)
				records.insert(val);
				//records_w.insert(val);
				if ((cnt + 1) % 10000 == 0)
				{
					// progress output every 10000 records
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}
		// write the chunk in sorted order as raw struc3 records
		for (std::set<struc3>::iterator it = records.begin(); it != records.end(); ++it)
		{
			try
			{
				// NOTE(review): no exception mask is set on the stream in this function,
				// so a failed write presumably only sets the stream state, which is not checked here
				files[f].write((char *)&(*it), sizeof(struc3));
				//file_w.write((char *)&(*it), sizeof(struc3));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}

		records.clear();
		files[f].close();  // chunk f is complete
	}
	// ---- step 3: merge the chunk files into file_w by item1 ----
	std::ifstream inputfiles[7];
	struc3 names[7] = { 0 };	// names[f] = current record of chunk file f
	bool eof_reached[7] = { false };
	long long num = 7;   // NOTE(review): never used
	for (int f = 0; f < 7; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		inputfiles[f].open(filename.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3; // chunk file could not be opened
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4; // chunk file has no first record
	}

	long long write_counter;
	write_counter = 0;
	while (true)
	{
		// find the still-open chunk whose current record has the smallest item1
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 7; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min<0 || names[n].item1 < name_min)
			{
				name_min = names[n].item1;
				n_min = n;
				continue;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done
		// NOTE(review): the selected record is written here, and names[n_min] is
		// written a second time below after the read, so every iteration
		// produces two writes to file_w
		file_w.write((char*)&names[n_min], sizeof(struc3));
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
		if (!file_w.write((char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			// NOTE(review): "std::endl" is inside the quotes, so the text is printed
			// literally and no line break is produced
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << "std::endl";
		}
	}
	file_w.close();

	// ---- step 4: second merge, selecting by item2, into file_w3 ----
	// NOTE(review): the loop above only ends once every eof_reached[n] is true,
	// and nothing resets the flags or reopens inputfiles before this point.
	// The inner for loop therefore skips all 7 inputs, n_min3 stays -1 and the
	// while loop breaks in its first pass without writing to file_w3.
	write_counter = 0;
	while (true)
	{
		std::string name_min3;
		long long n_min3 = -1;
		for (int n = 0; n < 7; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min3<0 || names[n].item2 < name_min3)
			{
				name_min3 = names[n].item2;
				n_min3 = n;
				continue;
			}
		}
		if (n_min3 < 0) break; // if all files were closed you are done
		file_w3.write((char*)&names[n_min3], sizeof(struc3c));
		if (!inputfiles[n_min3].read((char*)&names[n_min3], sizeof(struc3c)))
		{
			eof_reached[n_min3] = true;
			inputfiles[n_min3].close();
		}
		if (!file_w3.write((char*)&names[n_min3], sizeof(struc3c)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min3].item2 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min3 = " << n_min3 << "  item2 = " << names[n_min3].item2 << "std::endl";
		}
	}
	file_w3.close();

	// prints std::numeric_limits<std::streamsize>::max() as stored in integer_limit
	std::cout << "the maximum int value is " << integer_limit << std::endl;
	return 0;
}

Open in new window

0
 
LVL 32

Expert Comment

by:sarabande
ID: 40622845
if you want to create 2 big files which were sorted differently you have to copy all parts of the code in main function not only the merge part. all the 1 million record files were sorted by item1. hence, it makes no sense to merge them by item2.

moreover, you didn't reopen the input files before your second merge loop, nor did you reset the eof_reached flags. both mistakes prevent the second merge loop from reading any records from the files and therefore your second file is empty. but even if you solve these merge issues, reopen the smaller files and reset the eof flags, the total file would not be sorted by item2 because the smaller files are sorted by item1.

i strongly recommend to not going this way which is a dead-end. instead let us make classes and functions such that we could make the current functionality reusable and were able to creating a smart database with a couple of index files by using code we already have and only needed to be parameterized.

if you ignore my advise and nevertheless want to create a couple of huge files each of them sorted differently, you could make 4 copies of your program and do the appropriate changes in each of them. omit the header files and define the structures directly above the main function. that way you also easily could adopt operator< to your needs. and you wouldn't need more help as you already have one working copy ....

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40623095
i strongly recommend to not going this way which is a dead-end. instead let us make classes and functions such that we could make the current functionality reusable and were able to creating a smart database with a couple of index files by using code we already have and only needed to be parameterized.

Many thanks Sara.

Using your given struct from your previous replies, should I then create both the small-size file and big file, to the relevant struct you showed?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40624284
my recommendation is to make one big file not sorted at all and put all names and numbers to index files (the names of item1, item2, item3 and item4 would be in the same index file, same as the numbers of item?). each index entry would point to the original record number of the big file. when doing so, you don't need to create small files for the big data file but could create and write the big data file sequentially with one loop. in the same loop you would create small files both for name index and number index. after 4 million entries (for all 4 items) you would write the files from their sorted set, close them and create a new small file where you start with empty sets for the next portion of small keys.

before i make a suggestion how you could build classes and functions out of your current code, i would need you to answer a question which i have asked before. what is the purpose of the i?_val text member? do you need one per record or one per item? your current code concatenates the name for each item a few times , what doesn't seem to be a meaningful use case.

if the texts are just arbitrary for the moment, i would suggest to use a little function which creates some kind of sentences for the texts, similar like you did it for the names:

// Returns a pseudo-random integer taken from rand(), reduced modulo maxrand
// and shifted by off, i.e. a value in the range [off, off + maxrand).
// maxrand must not be 0 (it is used as a divisor).
int randindex(int maxrand, int off = 0)
{
   const int draw = rand() % maxrand;
   return off + draw;
}

// Builds a random pseudo-sentence made of invented words and returns at most
// maxlen characters of it.
//
// The sentence consists of randindex(NW, 2) words separated by single blanks.
// Every word is assembled piece by piece from the letter tables below; the
// three flags bpre/bvoc/bpost decide which kind of piece comes next:
//   bpre  - the word starts with a consonant cluster from prevocals
//   bvoc  - the next piece is a single vowel
//   bpost - a vowel was just written, so a postvocals cluster may follow
// All randomness comes from randindex(), so the output depends on the state of
// the rand() generator.
std::string createItemText(int maxlen)
{
    // letter tables ("vocals" are the vowels)
    static std::string vocals ="aeiouy";
    static std::string consonants = "bcdfghjklmnpqrstxz";
    // consonant clusters that may open a word
    static std::string prevocals[] = { "bl", "br", "ch", "cl", "cr", "dr", "dw", "fl", "fr", "gl", "gh", "gr", 
                                       "kl", "kn", "kr", "pl", "pr", "qu", "rh", "sch", "sh", "sp", "sl", "sk", 
                                       "st", "str", "spr", "th", "tr", "wh", "wr", };
    // consonant clusters that may follow a vowel ("rg" is listed twice)
    static std::string postvocals[] = { "ch", "nd", "rd", "nt", "rt", "rp", "rch", "sh", "wn", "rg", "rg","rl","th", "tt", "ss"};
    // table sizes used as upper bounds for randindex()
    static int NV = vocals.length();                            
    static int NC = consonants.length();                            
    static int NPRE = sizeof(prevocals)/sizeof(prevocals[0]);
 
    static int NPOST = sizeof(postvocals)/sizeof(postvocals[0]);
    static int NW = 6;      // range of the random word count
    static int NL   = 12;   // not used anywhere in this function
    static int NS   = 4;    // range of the random piece count per word
    
    int nw = randindex(NW, 2);    // words of sentence
    std::string strspace;         // empty before the first word, " " afterwards
    std::ostringstream os;
    while (--nw >= 0)
    {
           os << strspace;
           strspace = " ";
           int ns = randindex(NS, 1); // syllables
           bool bpre = (randindex(2) == 0);   // every second word (on average) starts with a cluster
           bool bpost = false;
           bool bvoc = false;
           // ns counts the pieces still to be written; some branches raise it
           // again so that a word does not end too early
           while (--ns >= 0)
           {
                if (bpre == true)
                { 
                     // opening consonant cluster, always followed by a vowel
                     bpre = false;
                     int np = randindex(NPRE);
                     os << prevocals[np];
                     if (ns == 0) ns = randindex(2, 1); // use at least 2 syllables
                     bvoc = true;
                }
                else if (bvoc == true)
                  { 
                     // single vowel
                     bvoc = false;                     
                     bpost = true;
                     int nv = randindex(NV);
                     if (ns == 0) ns = randindex(2); // at least 50 percent should not end with a vocal
                     os << vocals[nv];
                }
                else if (bpost == false || (randindex(2)==0))  // 50 percent use a single consonant after a vocal 
                {
                     // single consonant, always followed by a vowel
                     bvoc = true;
                     int nc = randindex(NC);
                     // a consonant that does not follow a vowel extends the word
                     if (bpost == false) ns += randindex(2, 1);  
                     os <<consonants[nc];
                }
                else 
                {
                     // consonant cluster after a vowel
                     bpost = false;
                     int np = randindex(NPOST);
                     os << postvocals[np];
                     bvoc = true;
                 }
            }        
      }
      
      // cut the sentence to the requested maximum length
      return os.str().substr(0, maxlen);
}

Open in new window

     

this function makes some funny sentences of arbitrary length using a language never heard before. you would call it like

std::string strval = createItemStruct(sizeof(i1_val)-1); 
strcpy_s(val.i1_val, sizeof(val.i1_val), strval.c_str());

Open in new window


wherever you want a long arbitrary text.

a similar weakness of your current approach is the i?_int member. this number currently is the same for all 4 items what apparently is a redundance of no value same as with the texts. furthermore it is identical to (record number - 1) if we would write the records without sorting them to the huge file. so, actually you simply could remove all i?_int members without any loss of information. to improve that you simply would create an arbitrary 64-bit number like that

// Creates an arbitrary 64-bit number from two consecutive rand() draws.
// The result is deterministic for a given rand() state. Where RAND_MAX is
// large the product wraps around modulo 2^64, so the result may be negative.
long long createItemNumber()
{
    // Widen before adding: rand() may return values up to RAND_MAX, which can
    // be as large as INT_MAX, so an int addition could overflow.
    const long long number1 = static_cast<long long>(rand()) + 12345;
    const long long number2 = static_cast<long long>(rand()) + 98765;
    // Multiply in unsigned arithmetic, where wrap-around is well defined
    // (signed overflow would be undefined behaviour).
    const unsigned long long a = static_cast<unsigned long long>(number1);
    const unsigned long long b = static_cast<unsigned long long>(number2);
    return static_cast<long long>(a * (b + 1) * (a + b + 77777));
}

Open in new window



note, it is a good chance that the above number is unique even for 320 million entries. the same applies for the names you were creating. but you only could make sure that they were unique by validating this when creating the merged files. when the minimum was determined you would have a chance to find duplicates (names or numbers) which then could be corrected by making a little increment to the current name or number.

What other ways should be adopted, to save the space wasted?
if you want to go the recommended way, we also will find a solution for this by storing the texts into another file with variable lengths. the big data file would not have 400 bytes wasted space per record but only "pointers" to the packed text file.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40625119
Many thanks Sara.
i?_val is one item value that is store some description of the item having length of about 100. Is there any details to demonstrate the way to create the relevant index file that is being sorted on i?_int or i?_val?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40625209
the index files were based on struct index_name and struct index_number. both structures were derived from a base struct.

// Part shared by all index entries.
struct index_base
{
       unsigned int recnum;   // number of the data-file record the entry points to
};

struct index_name : public index_base
{
     char name[21];
     bool operator< (const index_name & ni) const
     {
           if (strcmp(name, ni.name) < 0) return true;
           return false;
     }
};

struct index_number : public index_base
{
     long long number;
     bool operator< (const index_number & ni) const
     {
           if (number < ni.number) return true;
           return false;
     }
};

Open in new window


your new save program would open the big file flout.dat. its records would be based on structure fouritems:

// One item of a data record: a name, a number and a descriptive text.
struct item
{
       char name[21];
       long long number;
       char description[100];
       // Strict ordering by name first and by number for equal names.
       bool operator< (const item & ni) const
       {
           const int order = strcmp(name, ni.name);
           return order < 0 || (order == 0 && number < ni.number);
       }
};

// Record layout of the big data file: four items stored back to back.
struct fouritems
{
       item items[4];
};

Open in new window


you don't need a std::set for fouritems struct as we would write the records unsorted to the file.

your new design would be

long long recnum = 0
create file for data file
      - use a function for creating the filename where you pass a prefix and an index
            - index == -1 means: don't add a suffix '_i' to the file name
for loop 80 counting f
     create small file for name index
          - use function for creating the filename passing "flname_idx" and counter f as arguments
     create small file for number index
          - use function for creating passing "flnum_idx" and counter f as arguments
     create set<index_name>
     create set<index_number>
     for loop 1 million counting n
           create record for 'fouritems' 
           for loop 4 counting i 
                 fill item i with random data.
                     - use a function to create name, number, and description of each item
                 create a index_name by using name and recnum
                 insert name of item to name set
                 create a index_number by using number and recnum
                 insert number of item to number set
           end for loop 4
           write 'fouritems' record to data file
           increment recnum
      end for loop 1 million
      for each index_name in set index_name 
             write index_name to small file index_name
      end for each
      close index_name file
      clear name_index set
      for each index_number in set number index 
             write index_number to small file index_number
      end for each
      close index_number file
      clear index_number set
end for loop 80
close big data file

Open in new window


note, you may not try to use your old code and make changes. instead try to make functions for any code part which was used more than once. copy the functionality from your old code, make a function out of it, and use only a call in your new main function.

if we got the new design compiled we finally would make an application class and turn the functions to member functions. we also would add functionality to have a separate data file for the texts such that the wasted space now could be avoided.

Sara
0
How to run any project with ease

Manage projects of all sizes how you want. Great for personal to-do lists, project milestones, team priorities and launch plans.
- Combine task lists, docs, spreadsheets, and chat in one
- View and edit from mobile/offline
- Cut down on emails

 
LVL 32

Expert Comment

by:sarabande
ID: 40625218
note, the index_base was not used so far. it will get into use when we design the binary search and either search for names or numbers.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40627395
Many many thanks Sara.
Can I have more details to these?

     create set<index_name>
     create set<index_number>

Open in new window

0
 
LVL 32

Expert Comment

by:sarabande
ID: 40627538
std::set<index_name> nameset;
std::set<index_number> numberset;

Open in new window


if you put these definitions into the f loop the sets will be cleared automatically at end of each loop circle.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40627613
Sorry, fouritems is now referring to 4 items, right? How about that sometimes I only want to sort on one specific item, how?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40627614
I mean to the big file.
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40627859
the big file is not sorted at all. with the new design you can search for all items regardless whether they were item1, item2, item3 or item4.

if you need a list of all records sorted - say - by item3 name only, you would read sequentially from name index file, then read the record from big file at the record number where the index is pointing to. then check whether the name you have read from index equals the item3 name. if yes, print the record (or write it to file). if not, skip the record.

for example if you have records with the following names

"B" "C" "D" "E"
"Z" "W" "V" "Y"
"M" "N" "A" "P"

Open in new window

your index file would be (be aware: record numbers are 0-based)

"A" -> 2
"B" -> 0
"C" -> 0
"D" -> 0
"E" -> 0
"M" -> 2
"N" -> 2
"P" -> 2
"V" -> 1
"W" -> 1
"Y" -> 1
"Z" -> 1

Open in new window


so if you read the index sequentially and take only the records where the index name is item3 name you get the records

2:  "M" "N" "A" "P"
0:  "B" "C" "D" "E"
1:  "Z" "W" "V" "Y"

Open in new window


which are ordered by item 3.

Sara
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40627893
note, if sorted lists from big file per item is a requirement you may speed-up the creation of such lists by adding the item index to the index structure. that way you could decide from index record whether it belongs to the required index or not.

that also would help if you want to search for a name in the item 3 name column only. then the binary search algorithm would skip all names when reading from index file which are not item 3 names.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40629901
Many thanks Sara. should I put recnum to both index_name and index_number structs?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40630169
if you derive from index_base as I suggested recnum "IS" a member both of index_name and index_number.

you could add member itemnum (values: 1,2,3,4) to index_base if you want to support an item specific search.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40630251
Sorry, can you please give me more details of the mechanism, to use the 2 indexes, to locate the item record? Thanks a lot
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40630298
did you already succeed in creating the datafile and the two index files?

the search mechanism is identical to that currently used in readbinaryfile with two exceptions.

First, you would not open the datafile to do the binary search but one of the two (merged) index files, depending on whether you want to search for name or number. you also would calculate nbegin, nend, nmid by using the st_size member of the __stat64 structure where you called _stat64 function for the index file (and not of the data file).

Second, if you have found an index record that matches, you would open the data file and read the items record (use struct fouritems to read it). you would need to calculate the file position by multiplying the recnum member of the index structure by the size of an items record, which is sizeof(fouritems). then, use the seekg function to position within the data file before reading.

then you can print members of all 4 items of the record as result of your query. one of the 4 items of the record matches with the search criteria.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40632314
Sorry Sara.

To save space, we can skip the step to create small files for both index_name and Index_number, but to only have the big files for both, right?
Probably we only need to keep small files for items only, right?
0
 
LVL 32

Assisted Solution

by:sarabande
sarabande earned 500 total points
ID: 40632559
we can skip the step to create small files for both index_name and Index_number
no. not at all. the necessity to creating small sorted files is because of the limited memory that does not allow to sort hundred million entries in one step. you already have the code to this. the only work left is to exchange the structures, containers and variables.

I will show you what to do next for a little part of your current code.

when creating items of a record you generate a key which has 20 letters using the following loop:

// Generates a 20-letter key in val.item1: each position gets a random letter,
// upper case (base 65 = 'A') or lower case (base 97 = 'a').
// NOTE: "+=" adds the letter to whatever the byte already contains, so the
// result is only a letter if val.item1 was zero-filled beforehand.
int j;
for (j = 0; j<20; j++)
{
     val.item1[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
}

Open in new window


you repeat this code part 3 times for item2, item3, and item4.

you now should replace the loop by a function call and in the function you additionally could "repair" the bug that the assignment was done by "+=" instead of "=" (the bug doesn't matter if the char array was all zeros).

// Fills name with sizname-1 random letters ('A'..'Z' or 'a'..'z') and
// terminates it with '\0'.
//   name    - destination buffer with room for at least sizname chars
//   sizname - total buffer size including the terminator; nothing is written
//             when it is not positive
void createItemName(char name[], int sizname)
{
   if (!name || sizname <= 0)
       return;   // no room, not even for the terminator
   for (int j = 0; j<sizname-1; j++)
   {
       const int letter = rand() % 26;
       const char base = (rand() % 2) ? 'A' : 'a';
       name[j] = static_cast<char>(base + letter);
   }
   name[sizname-1] = '\0';
}

Open in new window


when filling a 'fouritems' record you now can do

// Create a zeroed record and fill each of its four items with generated data.
fouritems record = { 0 };
for (int j=0; j<4; ++j)
{
     createItemName(record.items[j].name, 21);
     record.items[j].number = createItemNumber();
     // The text may be up to 99 characters long, so strcpy_s must be told the
     // full buffer size (100); with a size of 99 a 99-character text would be
     // rejected because the terminator does not fit.
     strcpy_s(record.items[j].description, sizeof(record.items[j].description), createItemText(99).c_str());
}

Open in new window


you could even simplify the above by creating a member function in the fouritems struct:

// Record layout of the data file with a helper that fills it.
struct fouritems
{
       item items[4];
       void fillItems();   // fills all four items with generated data
};
...
void fouritems::fillItems()
{
   for (int j=0; j<4; ++j)
   {
     createItemName(record.items[j].name, 21);
     record.items[j].number = createItemNumber();
     strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
    }
}

Open in new window


such that your main function only would do

fouritems record = { 0 };
record.fillItems();

Open in new window


and even that could be reduced to one statement if you would provide a constructor of fouritems which fills the items automatically:

// Record layout of the data file.
struct fouritems
{
       item items[4];
       // bfill == true: the items are filled with generated data,
       // otherwise the record is only zeroed
       fouritems(bool bfill = false); 

};
...
void fouritems::fouritems(bool bfill)
{
    memset(this, 0, sizeof(fouritems));
    if (bfill == true)
    {
        for (int j=0; j<4; ++j)
        {
           createItemName(record.items[j].name, 21);
           record.items[j].number = createItemNumber();
           strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
        }
    }
}

...
fouritems record(true);   // record automatically filled by constructor

Open in new window


you see that the code is much leaner and also can be maintained better. in the above code you finally would turn the create... functions to member functions of the struct item, what then is an inherently consistent design.

To save space, ....
you can save space if you write the variable length texts packed to a further data file and have the items file only point into the packed text file. we will do that optimization after you created the index files.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40632702
Many thanks Sara.
Error	4	error C2533: 'fouritems::{ctor}' : constructors not allowed a return type	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	95	1	SaveBinaryFile
Error	5	error C2065: 'record' : undeclared identifier	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	6	error C2228: left of '.items' must have class/struct/union	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	7	error C2228: left of '.name' must have class/struct/union	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	8	error C2065: 'record' : undeclared identifier	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	102	1	SaveBinaryFile

Open in new window

to the last 3 lines below

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

// Part shared by all index entries.
struct index_base
{
	unsigned int recnum;	// number of the data-file record the entry points to
};

struct index_name : public index_base
{
	char name[21];
	bool operator< (const index_name & ni) const
	{
		if (strcmp(name, ni.name) < 0) return true;
		return false;
	}
};

struct index_number : public index_base
{
	long long number;
	bool operator< (const index_number & ni) const
	{
		if (number < ni.number) return true;
		return false;
	}
}; 

// One item of a data record: a name, a number and a descriptive text.
struct item
{
	char name[21];
	long long number;
	char description[100];
	// Length of name, capped at the size of the name buffer.
	int  get_len() const
	{
		const int limit = (int)sizeof(name);
		const int len = strlen(name);
		return (len < limit) ? len : limit;
	}
	// Converts at most min(get_len(), sizfld) characters of name to wide
	// characters and stores them in nm_uni.
	void get_uni_nm(wchar_t nm_uni[], int sizfld) const
	{
		int count = get_len();
		if (sizfld < count)
			count = sizfld;
		mbstowcs(nm_uni, name, count);
	}
	// Strict ordering by name first and by number for equal names.
	bool operator< (const item & ni) const
	{
		const int order = strcmp(name, ni.name);
		if (order != 0)
			return order < 0;
		return number < ni.number;
	}
};

using namespace std;
item binrec;

// Fills name with sizname-1 random letters ('A'..'Z' or 'a'..'z') and
// terminates it with '\0'.
//   name    - destination buffer with room for at least sizname chars
//   sizname - total buffer size including the terminator; nothing is written
//             when it is not positive
void createItemName(char name[], int sizname)
{
	if (!name || sizname <= 0)
		return;	// no room, not even for the terminator
	for (int j = 0; j<sizname - 1; j++)
	{
		// 65 is 'A' and 97 is 'a'; any other base would leave the letter range
		name[j] = (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
	}
	name[sizname - 1] = '\0';
}

// Record layout of the data file.
struct fouritems
{
	item items[4];
	// bfill == true: the items are filled with generated data,
	// otherwise the record is only zeroed
	fouritems(bool bfill = false);

};

// Constructor of fouritems: zero-fills the object and, when bfill is true,
// is meant to fill all four items with generated data.
// NOTE(review): as posted this does not compile:
//  - a constructor must not declare a return type, so the leading 'void' is
//    rejected (error C2533);
//  - 'record' is not declared in this scope (error C2065); the members of the
//    object under construction are reached directly as items[j].
// NOTE(review): description has 100 bytes but strcpy_s is given a size of 99
// while createItemText(99) may return 99 characters - confirm the size.
void fouritems::fouritems(bool bfill)
{
	memset(this, 0, sizeof(fouritems));
	if (bfill == true)
	{
		for (int j = 0; j<4; ++j)
		{
			createItemName(record.items[j].name, 21);
			record.items[j].number = createItemNumber();
			strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
		}
	}
}

Open in new window


while I did some other correction to the project.
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40632731
constructors not allowed a return type

remove the 'void' before constructor function

the other error are following errors of the first one.

Sara
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40633233
by the way: I wonder why you still have a global variable 'binrec' defined. you easily should see that 'binrec' is no longer used since months.

any code which is  no longer used is ballast and should be removed. the same applies for functions never called like the get_uni or get_len which would make sense if you would handle wide strings as well or have keys of different length. but you don't have neither the one nor the other.

for your information: if one uses a struct rather than a class, they normally do so, because they want to access members from outside without using member functions. in a class you normally would have private or protected members and "setters" and "getters" member functions for accessing them outside of the class.

however, this is only convention to do so, and actually there is no difference (in c++) between struct and class beside that for the struct the members default to be public if not defined differently while in a class they default to be private.

our structures are pure data structures what means they could be used to exchange data records from file to structure and reverse. so we don't want to make them complex objects and use struct rather than class. nevertheless, c++ functionality like constructors, operator< make sense also for struct and can help to achieve a better design.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40634610
Many thanks Sara.
I don't know which record you're referring to, on last 3rd line below?

// Constructor of fouritems: zero-fills the object and, when bfill is true,
// is meant to fill all four items with a generated name, a random number
// below 10000000 and a generated text.
// NOTE(review): 'record' is not declared in this scope; inside the
// constructor the members of the object being built are reached directly as
// items[j] (or this->items[j]).
// NOTE(review): description has 100 bytes but strcpy_s is given a size of 99
// while createItemText(99) may return 99 characters - confirm the size.
fouritems::fouritems(bool bfill)
{
	memset(this, 0, sizeof(fouritems));
	if (bfill == true)
	{
		for (int j = 0; j<4; ++j)
		{
			createItemName(record.items[j].name, 21);
			record.items[j].number = std::rand() % 10000000;
			strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
		}
	}
}

Open in new window

0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40634754
what should be "record" to this

record.items[j].name

Open in new window


Thanks a lot and appreciate a lot to your help!
0
 
LVL 32

Assisted Solution

by:sarabande
sarabande earned 500 total points
ID: 40634893
sorry. remove 'record.' from the statements. in a constructor 'record' was not passed as argument, but is the object itself which can be omitted or accessed via this->

// Zero-initialises the record and, when bfill is true, fills all four items
// with a generated name, a random number below 10000000 and a generated text.
fouritems::fouritems(bool bfill)
{
     memset(this, 0, sizeof(fouritems));
     if (bfill == true)
     {
            for (int j = 0; j<4; ++j)
            {
                  createItemName(items[j].name, 21);
                  items[j].number = std::rand() % 10000000;
                  // pass the full buffer size so that a 99-character text plus '\0' fits
                  strcpy_s(items[j].description, sizeof(items[j].description), createItemText(99).c_str());
            }
      }
}

Open in new window


Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40634947
Sorry, how to adjust the following
		for (cnt = 0; cnt<1000000; cnt++)
		{
			cnt_val++;
			try
			{
				item val = { 0 };
				int j;
				for (j = 0; j<20; j++)
				{
					val.name[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.number = cnt_val;
				strcpy(val.description, val.name);
				strcat(val.description, " ");
				strcat(val.description, val.name);
				strcat(val.description, " ");
				strcat(val.description, val.name);

				//strcat(val.i4_val, "????");

				records.insert(val);
				...

Open in new window

to use createItemName, and fouritems event instead?
0
 
LVL 32

Assisted Solution

by:sarabande
sarabande earned 500 total points
ID: 40635013
as told we would not sort the item records anymore. therefore we don't need a set 'records' and we also don't need a 1 million records small file.

so best would be to start with a brand-new main function like that

// Builds a file name from a prefix and an extension.
// With a non-negative filenum the result is "<prefix>_<filenum><ext>",
// otherwise (the default) it is just "<prefix><ext>".
std::string createFilename(const std::string & strprefix, const std::string & strext, int filenum = -1)
{
         std::string result = strprefix;
         if (filenum >= 0)
         {
             std::ostringstream suffix;
             suffix << "_" << filenum;
             result += suffix.str();
         }
         return result + strext;
}

int main()
{
       std::string strdatafile = createFilename("c:\\dp4\\flout", ".dat");
       std::ofstream datafile(strdatafile.c_str(), std::ios::binary | std::ios::out);
       // check for error and log error if any. then return with error
       long long recnum = 0;
       for (int f = 0; f < 80; ++f)
       {
            std::string strnameindexfile = createFilename("c:\\dp4\\flname", ".idx", f+1);
            std::ofstream namefile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);
            // check for error and log error if any. then return with error
            std::string strnumberindexfile = createFilename("c:\\dp4\\flnumber", ".idx", f+1);
            std::ofstream numberfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);
            // check for error and log error if any. then return with error
            std::set<index_name> nameset;
            std::set<index_number> numberset;
            for (size_t n = 0; n < 1000000; ++n, ++recnum)
            {
                 fouritems record(true);   // create filled record
                 if (!datafile.write((char*)&record, sizeof(record))
                 {
                       // log error and return
                 }
                 for (int j = 0; j < 4; ++j)
                 {
                     index_name idxnam;
                     strcpy_s( idxnam.name, 21, record.items[j].name);
                     idxnam.recnum = recnum;
                     nameset.insert(idxnam);
                     index_number idxnum;
                     idxnum.number =  record.items[j].number;
                     numberset.insert[idxnum];
                  }
            }
            std::set<index_name>::iterator inam;
            for (inam = nameset.begin(); inam != nameset.end(); ++inam)
            {
                  index_name nam = *inam;
                   if (!namefile.write((char*)&nam, sizeof(nam))
                   {
                         // log error and return with error code
                   }
            }
            // repeat all the above for numberset
            ...
            namefile.close();
            numberfile.close();
      }
      datafile.close();
      // here we have 1 datafile and 80 index files for each index
      // we then would merge the index files for names to a big name index file
      // ... and merge the 80 number index files to a big number index file as well
      ...
      return 0;
}

Open in new window

 

note, i didn't compile the above. it might have some typing errors.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40636904
Many many thanks Sara.
I've already created the small files for both name and number. what should be adjusted below

		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		std::string strflname = flname.str();
		filesname[f].open(strflname.c_str(), std::ios::binary | std::ios::out);
		if (!filesname[f].is_open()) return errno;
		...
	// Merge step: reads the first entry of each of 20 small name index files
	// and then repeatedly writes the entry with the smallest name to
	// filesoutnam, refilling the slot from the file it came from.
	// NOTE(review): the buffer elements are of type 'item' and the first read
	// uses sizeof(item), while all later reads and writes use
	// sizeof(index_name) - the record type should be the same everywhere.
	std::ifstream inputfilesnam[20];
	item names[20] = { 0 };
	bool eof_reached[20] = { false };
	long long num = 20;   // not used in the code shown
	for (int f = 0; f < 20; ++f)
	{
		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		inputfilesnam[f].open(flname.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfilesnam[f].is_open())
			return -3; //S
		if (!inputfilesnam[f].read((char*)&names[f], sizeof(item)))
			return -4; //
	}

	long long write_counter;
	write_counter = 0;
	while (true)
	{
		// find the slot holding the smallest name among the files still open
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 20; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min<0 || names[n].name < name_min)
			{
				name_min = names[n].name;
				n_min = n;
				continue;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done
		filesoutnam.write((char*)&names[n_min], sizeof(index_name));
		if (!inputfilesnam[n_min].read((char*)&names[n_min], sizeof(index_name)))
		{
			eof_reached[n_min] = true;
			inputfilesnam[n_min].close();
		}
		// NOTE(review): this is a second write in the same pass; the slot now
		// holds the entry that was just read (or the old one after a failed
		// read), which was not selected as the minimum.
		if (!filesoutnam.write((char*)&names[n_min], sizeof(index_name)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].name << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			// NOTE(review): "std::endl" is inside quotes here, so it is
			// printed as text and no line break is written.
			std::cout << write_counter << " n_min = " << n_min << "  name = " << names[n_min].name << "std::endl";
		}
	}
	filesoutnam.close();

Open in new window


when I'm to create the big file to name?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40636905
I mean to create the big idx file to name, in above.
0
 
LVL 32

Assisted Solution

by:sarabande
sarabande earned 500 total points
ID: 40637441
i dropped the baseclass base_index as it makes more troubles than helped.

then the structures are like

...
#include <vector>
const unsigned int NUM_FILES   = 80;
const unsigned int NUM_RECORDS = 1000000;

// Index entry that maps an item name to the record containing it.
struct index_name
{
    unsigned int recnum;   // number of the data-file record the entry points to
    char name[21];         // zero-terminated key
    // Orders entries by name. Entries with the same name are ordered by
    // record number, so that a std::set keeps one entry per record instead of
    // silently discarding every entry whose name is already present.
    bool operator< (const index_name & ni) const
    {
        const int cmp = strcmp(name, ni.name);
        if (cmp != 0) return cmp < 0;
        return recnum < ni.recnum;
    } 
};

// Index entry that maps an item number to the record containing it.
struct index_number
{
    unsigned int recnum;   // number of the data-file record the entry points to
    long long number;      // key
    // Orders entries by number. Entries with the same number are ordered by
    // record number, so that a std::set keeps one entry per record instead of
    // silently discarding every entry whose number is already present.
    bool operator< (const index_number & ni) const
    {
        if (number != ni.number) return number < ni.number;
        return recnum < ni.recnum;
    }
};

// One item of a data record: a name, a number and a descriptive text.
struct item
{
    char name[21];
    long long number;
    char description[100];
    // Orders items by name only; number and description are not compared.
    bool operator<(const item & it) const
    {
        const int order = strcmp(name, it.name);
        if (order < 0)
            return true;
        return false;
    }
};

// Record layout of the data file.
struct fouritems
{
    item items[4];
    // bfill == true: the items are filled with generated data,
    // otherwise the record is only zeroed
    fouritems(bool bfill = false); 
};

// Zero-initialises the record and, when bfill is true, fills all four items
// with a generated name, number and text.
fouritems::fouritems(bool bfill)
{
    memset(this, 0, sizeof(fouritems));
    if (bfill == true)
    {
        for (int j = 0; j<4; ++j)
        {
            createItemName(items[j].name, 21);
            items[j].number = createItemNumber();
            // pass the full buffer size so that a 99-character text plus '\0' fits
            strcpy_s(items[j].description, sizeof(items[j].description), createItemText(99).c_str());
        }
    } 
}

Open in new window


instead of using a base class i used a template function for the merge:

// Merges NUM_FILES sorted index files into one sorted output file.
//   index       - index entry type; must provide operator<
//   outputfile  - opened binary output stream; it is closed on success
//   inputfiles  - array of NUM_FILES opened binary input streams, each
//                 containing entries of type index in ascending order
// Returns 0 on success and a non-zero error code when an input file yields no
// first entry or when writing to the output file fails.
template <class index>
int mergeFiles(std::ofstream & outputfile, std::ifstream inputfiles[])
{   
    index indexarr[NUM_FILES]   = {};       // current entry of every input file
    const int recordsize = sizeof(index);
    std::vector<bool>  eofreached(NUM_FILES, false); 
    for (unsigned int i = 0; i < NUM_FILES; ++i)
    {
        if (!inputfiles[i].read((char*)&indexarr[i], recordsize))
        {
            // empty file or read error; errno may be 0 for an empty file, so
            // make sure the caller still sees a failure
            return errno != 0 ? errno : -1;
        }
    }
    while (true)
    {
        // find the smallest current entry among the files not yet exhausted
        unsigned int n_min = 0;
        for (unsigned int n = 1; n < NUM_FILES; ++n)
        {
            if (eofreached[n] == true)
                continue;
            if (eofreached[n_min] == true || indexarr[n] < indexarr[n_min])
            {
                n_min = n;
            }
        }
        // every file is exhausted: the merge is complete
        if (eofreached[n_min] == true)
            break;
        if (!outputfile.write((char*)&indexarr[n_min], recordsize))
        {
            // e.g. disk full: stop instead of producing a truncated index
            return errno != 0 ? errno : -1;
        }
        // refill the slot from the file the entry came from
        if (!inputfiles[n_min].read((char*)(&indexarr[n_min]), recordsize))
        {
            inputfiles[n_min].close();
            eofreached[n_min] = true;
        }
    }
    outputfile.close();
    return 0;
}

Open in new window


the rest then is quite straight forward:

// Builds a file name from a prefix and an extension.
// With a non-negative filenum the result is "<prefix>_<filenum><ext>",
// otherwise (the default) it is just "<prefix><ext>".
std::string createFilename(const std::string & strprefix, const std::string & strext, int filenum = -1)
{
    std::string result = strprefix;
    if (filenum >= 0)
    {
        std::ostringstream suffix;
        suffix << "_" << filenum;
        result += suffix.str();
    }
    return result + strext;
}

// Creates the test data: one data file with NUM_FILES * NUM_RECORDS fouritems
// records and, for names and numbers each, NUM_FILES partial index files that
// are finally merged into one big index file per index.
// Returns 0 on success, a non-zero error code otherwise.
int main()
{
    std::string strdatafile = createFilename("c:\\dp4\\flout", ".dat");
    std::ofstream datafile(strdatafile.c_str(), std::ios::binary | std::ios::out);
    if (!datafile.is_open())
    {
        // log error. errno may be 0 after a stream failure, so make sure
        // that a failure never is returned as 0
        return (errno != 0) ? errno : -1;
    }
    unsigned int recnum = 0;
    for (int f = 0; f < NUM_FILES; ++f)
    {
        std::string strnameindexfile = createFilename("c:\\dp4\\flname", ".idx", f+1);
        std::ofstream namefile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);
        if (!namefile.is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
        std::string strnumberindexfile = createFilename("c:\\dp4\\flnumber", ".idx", f+1);
        std::ofstream numberfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);
        if (!numberfile.is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
        // the sets are created inside the f-loop, so each partial index file
        // starts with empty sets; std::set keeps the keys ordered by operator<
        std::set<index_name> nameset;
        std::set<index_number> numberset;
        for (size_t n = 0; n < NUM_RECORDS; ++n, ++recnum)
        {
            fouritems record(true);   // create filled record
            if (!datafile.write((char*)&record, sizeof(record)))
            {
                // log error and return
                return (errno != 0) ? errno : -1;
            }
            // each of the four items gets one name key and one number key,
            // both pointing to the record number in the data file
            for (int j = 0; j < 4; ++j)
            {
                index_name idxnam;
                strcpy_s(idxnam.name, 21, record.items[j].name);
                idxnam.recnum = recnum;
                nameset.insert(idxnam);
                index_number idxnum;
                idxnum.number =  record.items[j].number;
                idxnum.recnum = recnum;
                numberset.insert(idxnum);
            }
        }
        std::set<index_name>::iterator inam;
        for (inam = nameset.begin(); inam != nameset.end(); ++inam)
        {
            index_name nam = (*inam);
            if (!namefile.write((char*)(&nam), sizeof(nam)))
            {
                // log error and return with error code
                return (errno != 0) ? errno : -1;
            }
        }
        // repeat all the above for numberset
        std::set<index_number>::iterator inum;
        for (inum = numberset.begin(); inum != numberset.end(); ++inum)
        {
            index_number num = *inum;
            if (!numberfile.write((char*)(&num), sizeof(index_number)))
            {
                // log error and return with error code
                return (errno != 0) ? errno : -1;
            }
        }
        namefile.close();
        numberfile.close();
    }

    datafile.close();
    // here we have 1 datafile and NUM_FILES index files for each index
    // we then would merge the index files for names to a big name index file
    // ... and merge the NUM_FILES number index files to a big number index file as well
    std::ifstream namefiles[NUM_FILES];
    std::ifstream numberfiles[NUM_FILES];
    for (int i = 0; i < NUM_FILES; i++)
    {
        std::string strnamefile = createFilename("c:\\dp4\\flname", ".idx", i+1);
        namefiles[i].open(strnamefile.c_str(), std::ios::binary | std::ios::in);
        std::string strnumberfile = createFilename("c:\\dp4\\flnumber", ".idx", i+1);
        numberfiles[i].open(strnumberfile.c_str(), std::ios::binary | std::ios::in);
        if (!namefiles[i].is_open() || !numberfiles[i].is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
    }
    std::string strnameindexfile = createFilename("c:\\dp4\\flname", ".idx");  
    std::ofstream nameindexfile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);
    std::string strnumberindexfile = createFilename("c:\\dp4\\flnumber", ".idx");  
    std::ofstream numberindexfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);
    if (!nameindexfile.is_open() || !numberindexfile.is_open())
    {
        // log error
        return (errno != 0) ? errno : -1;
    }

   // here we call the mergeFiles template for name_index and number_index
    int ret;
    if ((ret = mergeFiles<index_name>(nameindexfile, namefiles)) != 0)
    {
        // log error 
        return ret;
    }
    if ((ret = mergeFiles<index_number>(numberindexfile, numberfiles)) != 0)
    {
        // log error 
        return ret;
    }
    return 0;
}

Open in new window


note, i used packed structures because you have some odd-sized members.

you can make all structures packed on 1 byte boundaries by making appropriate setting at

properties - configuration properties - c/c++ - code generation - struct member alignment = 1 byte (/zp1)

you have to do it for debug and release configuration.

Sara
0
 
LVL 32

Assisted Solution

by:sarabande
sarabande earned 500 total points
ID: 40638168
you had better use the code i posted. it is working code and starts with a new main function rather than keeping the old (and wrong) stuff which causes the problems.

std::set<item> records_w3;
std::set<index_name> nameset;
std::set<index_number> numbset;
the main issue you encounter is because you defined the 3 sets above all loops. the first set is not needed anymore because we are writing the item records sequentially to data file. both the other set should be created in the f-loop. then you would get automatically a new set for each small file while currently you "add" the next 4 million keys to the one and only set you have for each index.

the next differences are that you used recnum as a long long while i used an unsigned int (what is good enough for 3 billion item records what is 16 billion names).

then you should use createItemNumber and createItemText for creating the numbers and the descriptions. you were creating a number with rand(), which with the microsoft runtime only returns numbers from 0 to 32767 (RAND_MAX, 15 bits). it does not really make sense to use such small number values for 320 million keys, because you would get a lot of duplicates, which causes trouble if you search for numbers later.

if you would use the structures i posted and the main function and add the missing createItemName, createItemNumber, createItemText and some error handling and logging, you have working and good code.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40639024
Thanks Sara.
I did remove the parts of "records_w" below
// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

// number of partial index files written per index (name / number) before merging
const unsigned int NUM_FILES = 20;
// number of fouritems records generated per partial file (4 items per record)
const unsigned int NUM_RECORDS = 1000000;

// Entry of the name index: an item name plus the number of the record it
// belongs to.
struct index_name
{
	long long recnum;
	char name[21];
	// Orders entries by name only; recnum does not take part in the comparison.
	bool operator< (const index_name & rhs) const
	{
		return strcmp(name, rhs.name) < 0;
	}
};

// Entry of the number index: an item number plus the number of the record
// it belongs to.
struct index_number
{
	long long recnum;
	long long number;
	// Orders entries by number only; recnum does not take part in the comparison.
	bool operator< (const index_number & rhs) const
	{
		return number < rhs.number;
	}
}; 

// One item of a data record: fixed-size name, number and description.
struct item
{
	char name[21];
	long long number;
	char description[100];
	// Returns the length of name, but never more than sizeof(name).
	// The search for the terminator is limited to the buffer, so a name
	// without terminating '\0' does not cause a read beyond the array.
	int  get_len() const
	{
		const void * terminator = memchr(name, '\0', sizeof(name));
		if (terminator == 0)
			return (int)sizeof(name);
		return (int)((const char *)terminator - name);
	}
	// Converts name to wide characters and stores at most sizfld of them in
	// nm_uni. Only the characters themselves are converted (the count passed
	// to mbstowcs excludes the terminator), so no terminating L'\0' is
	// written - the caller has to provide it if needed.
	void get_uni_nm(wchar_t nm_uni[], int sizfld) const
	{
		int len = get_len();
		if (len > sizfld)
			len = sizfld;
		if (len < 0)
			len = 0;    // a negative count would be converted to a huge size_t
		mbstowcs(nm_uni, name, len);
	}
	// Orders items by name and, for equal names, by number.
	bool operator< (const item & ni) const
	{
		const int cmp = strcmp(name, ni.name);
		if (cmp != 0) return cmp < 0;
		return number < ni.number;
	}
};

using namespace std;

// Fills name with sizname - 1 random letters (upper or lower case chosen at
// random) followed by a terminating '\0'.
//   name    - destination buffer of at least sizname characters
//   sizname - size of the buffer including the terminator; for values <= 0
//             nothing is written (name[sizname - 1] would be outside the buffer)
void createItemName(char name[], int sizname)
{
	if (sizname <= 0)
		return;
	for (int j = 0; j<sizname - 1; j++)
	{
		const int letter = rand() % 26;
		const char first = (rand() % 2) ? 'A' : 'a';
		name[j] = (char)(first + letter);
	}
	name[sizname - 1] = '\0';
}

struct fouritems
{
	item items[4];
	fouritems(bool bfill = false);

};

fouritems::fouritems(bool bfill)
{
	memset(this, 0, sizeof(fouritems));
	char str1[100];
	if (bfill == true)
	{
		for (int j = 0; j<4; ++j)
		{
			createItemName(items[j].name, 21);
			items[j].number = std::rand() % NUM_RECORDS;
			strcpy(str1, items[j].name);
			strcat(str1, " ");
			strcat(str1, items[j].name);
			strcat(str1, " ");
			strcat(str1, items[j].name);
			strcpy_s(items[j].description, 99, str1);
		}
	}
}

// Merges NUM_FILES index files into one output file.
// In every round the smallest (by operator<) of the current head records is
// written to outputfile and replaced by the next record of the same input
// file. The result is only sorted if every input file is sorted itself
// (presumably guaranteed by the caller - verify).
//   index       - fixed-size record type, read and written as raw bytes
//   outputfile  - opened binary output stream; closed on success
//   inputfiles  - array of NUM_FILES opened binary input streams
// Returns 0 on success and a non-zero code if an input file yields no first
// record or if writing fails.
template <class index>
int mergeFiles(std::ofstream & outputfile, std::ifstream inputfiles[])
{
	index indexarr[NUM_FILES] = { 0 };    // current head record of each input file
	int   recordsize = sizeof(index);
	std::vector<bool>  eofreached(NUM_FILES, false);
	for (unsigned int i = 0; i < NUM_FILES; ++i)
	{
		if (!inputfiles[i].read((char*)&indexarr[i], recordsize))
		{
			// log error message
			// empty file or read error. errno may still be 0 here (plain
			// end-of-file does not set it), so never return "success".
			return (errno != 0) ? errno : -1;
		}
	}
	while (true)
	{
		// find the input file whose head record is the smallest;
		// exhausted files are skipped
		unsigned int n_min = 0;
		for (unsigned int n = 1; n < NUM_FILES; ++n)
		{
			if (eofreached[n] == true)
				continue;
			if (eofreached[n_min] == true || indexarr[n] < indexarr[n_min])
			{
				n_min = n;
			}
		}
		// n_min can only point to an exhausted file if all files are exhausted
		if (eofreached[n_min] == true)
			break;
		if (!outputfile.write((char*)&indexarr[n_min], recordsize))
		{
			// log error message (e.g. disk full)
			return (errno != 0) ? errno : -1;
		}
		if (!inputfiles[n_min].read((char*)(&indexarr[n_min]), recordsize))
		{
			inputfiles[n_min].close();
			eofreached[n_min] = true;
		}
	}
	outputfile.close();
	return 0;
}

int main()
{
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long recnum = 0;
	long long cnt_val = 0;
	std::ofstream files[NUM_FILES];
	std::ofstream filesname[NUM_FILES];
	std::ofstream filesnumb[NUM_FILES];
	std::ofstream flname;
	std::ofstream flnumb;
	std::ofstream file_w;
	//std::set<item> records_w;
	std::ostringstream flout_w;
	flout_w << "c:\\dp4b\\flout_w.dat";
	std::string strflout_w = flout_w.str();
	file_w.open(strflout_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	//std::set<item> records_w3;
	std::set<index_name> nameset;
	std::set<index_number> numbset;

	for (int f = 0; f < NUM_FILES; ++f)
	{
		std::set<item> records;
		srand((int)time(NULL));

		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		std::string strflname = flname.str();
		filesname[f].open(strflname.c_str(), std::ios::binary | std::ios::out);
		if (!filesname[f].is_open()) return errno;

		std::ostringstream flnumb;
		flnumb << "c:\\dp4b\\flnumb" << f << ".idx";
		std::string strflnumb = flnumb.str();
		filesnumb[f].open(strflnumb.c_str(), std::ios::binary | std::ios::out);
		if (!filesnumb[f].is_open()) return errno;

		for (cnt = 0; cnt<NUM_RECORDS; cnt++)
		{
			recnum++;
			fouritems record(true);
			try
			{
				file_w.write((char *)&record, sizeof(record));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}

			for (int j = 0; j < 4; j++)
			{
				index_name idxname;
				strcpy_s(idxname.name, 21, record.items[j].name);
				idxname.recnum = recnum;
				nameset.insert(idxname);

				index_number idxnumb;
				idxnumb.number = record.items[j].number;
				numbset.insert(idxnumb);

			}

		}
		for (std::set<index_name>::iterator iname = nameset.begin(); iname != nameset.end(); ++iname)
		{
			try
			{
				index_name nam = *iname;
				filesname[f].write((char *)&nam, sizeof(nam));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}
		for (std::set<index_number>::iterator inumb = numbset.begin(); inumb != numbset.end(); ++inumb)
		{
			try
			{
				index_number num = *inumb;
				filesnumb[f].write((char *)&num, sizeof(num));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}

		filesname[f].close();
		filesnumb[f].close();
	}

	// here we have 1 datafile and NUM_FILES index files for each index
	// we then would merge the index files for names to a big name index file
	// ... and merge the NUM_FILES number index files to a big number index file as well
	std::ifstream namefiles[NUM_FILES];
	std::ifstream numberfiles[NUM_FILES];
	for (int i = 0; i < NUM_FILES; i++)
	{
		//std::string strnamefile = createFilename("c:\\dp4\\flname", ".idx", i + 1);
		std::ostringstream flname2;
		flname2 << "c:\\dp4b\\flname" << i << ".idx";
		std::string strnamefile = flname2.str();
		namefiles[i].open(strnamefile.c_str(), std::ios::binary | std::ios::in);

		std::ostringstream flnumb2;
		flnumb2 << "c:\\dp4b\\flnumb" << i << ".idx";
		std::string strnumberfile = flnumb2.str();
		numberfiles[i].open(strnumberfile.c_str(), std::ios::binary | std::ios::in);
	}

	std::ostringstream flname_w;
	flname_w << "c:\\dp4b\\flname_w.idx";
	std::string strnameindexfile = flname_w.str();
	std::ofstream nameindexfile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);

	std::ostringstream flnumb_w;
	flnumb_w << "c:\\dp4b\\flnumb_w.idx";
	std::string strnumberindexfile = flnumb_w.str();
	std::ofstream numberindexfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);

	// here we call the mergeFiles template for name_index and number_index
	int ret;
	if ((ret = mergeFiles<index_name>(nameindexfile, namefiles)) != 0)
	{
		// log error 
		return ret;
	}
	if ((ret = mergeFiles<index_number>(numberindexfile, numberfiles)) != 0)
	{
		// log error 
		return ret;
	}
	file_w.close();

	return 0;
}

Open in new window

but the files created are still in increasing size like what I mentioned yesterday. why?
0
 
LVL 32

Accepted Solution

by:
sarabande earned 500 total points
ID: 40639069
as told both the std::set for index_name and index_number must be moved into the f-loop. or you would need to clear the sets after writing to file.

you still are not using my code but yours. beside of the 'records_w' there are a lot of redundant and wrong code, and it makes little to no sense to discuss all the wrong or needless statements again since i already have done it a multiple times.

below is working code which only has to be enhanced by a reasonable error handling.

you should do that such that we could move to the two more interesting parts:
- saving space by moving the variable texts to a separate text file
- doing a binary search on either name index or number index

#include <sys/stat.h>
#include <errno.h>
#include <fstream>
#include <string>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <set>

// number of partial index files written per index (name / number) before merging
const unsigned int NUM_FILES   = 20;
// number of fouritems records generated per partial file (4 items per record)
const unsigned int NUM_RECORDS = 10000000;

// Returns a random value between off and off + maxrand - 1 (both inclusive).
// maxrand must not be 0.
int randindex(int maxrand, int off = 0)
{
    const int pick = rand() % maxrand;
    return off + pick;
}

// Creates a random text of 2 to 7 pseudo words separated by single blanks
// and returns at most maxsize characters of it (the text is simply cut off
// at maxsize). The words are built from vowels, single consonants and the
// consonant clusters of the tables below. The result depends on the
// sequence of rand() calls, so it is only reproducible for the same seed.
std::string createItemText(int maxsize)
{
    // building blocks of the words ("vocals" are the vowels)
    static std::string vocals ="aeiouy";
    static std::string consonants = "bcdfghjklmnpqrstxz";
    // clusters that are put in front of a vowel
    static std::string prevocals[] = { "bl", "br", "ch", "cl", "cr", "dr", "dw", "fl", "fr", "gl", "gh", "gr", 
        "kl", "kn", "kr", "pl", "pr", "qu", "rh", "sch", "sh", "sp", "sl", "sk", 
        "st", "str", "spr", "th", "tr", "wh", "wr", };
    // clusters that are put behind a vowel
    static std::string postvocals[] = { "ch", "nd", "rd", "nt", "rt", "rp", "rch", "sh", "wn", "rg", "rg","rl","th", "tt", "ss"};
    // number of entries of each table
    static int NV = vocals.length();                            
    static int NC = consonants.length();                            
    static int NPRE = sizeof(prevocals)/sizeof(prevocals[0]);

    static int NPOST = sizeof(postvocals)/sizeof(postvocals[0]);
    // NW and NS are the range sizes for the random word and syllable counts;
    // NL is not used in this function
    static int NW = 6;
    static int NL   = 12;
    static int NS   = 4;

    int nw = randindex(NW, 2);    // words of sentence
    std::string strspace;
    std::ostringstream os;
    while (--nw >= 0)
    {
        // strspace is empty for the first word and a blank for all others
        os << strspace;
        strspace = " ";
        int ns = randindex(NS, 1); // syllables
        // state of the word: bpre = start with a cluster, bvoc = a vowel
        // comes next, bpost = the last piece written was a vowel
        bool bpre = (randindex(2) == 0);
        bool bpost = false;
        bool bvoc = false;
        // each pass writes exactly one piece (cluster, vowel or consonant);
        // ns counts the remaining pieces and may be raised inside the loop
        while (--ns >= 0)
        {
            if (bpre == true)
            { 
                bpre = false;
                int np = randindex(NPRE);
                os << prevocals[np];
                if (ns == 0) ns = randindex(2, 1); // use at least 2 syllables
                bvoc = true;
            }
            else if (bvoc == true)
            { 
                bvoc = false;                     
                bpost = true;
                int nv = randindex(NV);
                if (ns == 0) ns = randindex(2); // at least 50 percent should not end with a vocal
                os << vocals[nv];
            }
            else if (bpost == false || (randindex(2)==0))  // 50 percent use a single consonant after a vocal 
            {
                bvoc = true;
                int nc = randindex(NC);
                // a consonant that does not follow a vowel gets 1 or 2 more pieces
                if (bpost == false) ns += randindex(2, 1);  
                os <<consonants[nc];
            }
            else 
            {
                bpost = false;
                int np = randindex(NPOST);
                os << postvocals[np];
                bvoc = true;
            }
        }        
    }
    // cut the text to the requested maximum length
    return os.str().substr(0, maxsize);
}       

// Creates a positive pseudo random 64-bit number from two rand() calls.
// Only the lower 15 bits of each rand() result are used: with the microsoft
// runtime (RAND_MAX == 32767) this changes nothing, but with a runtime where
// rand() returns up to INT_MAX both the int addition and the product below
// would overflow. The biggest possible result is
// 45112 * 131533 * 254421, which is far below the 64-bit limit.
long long createItemNumber()
{
    const long long number1 = (long long)(rand() % 32768) + 12345;
    const long long number2 = (long long)(rand() % 32768) + 98765;
    return (number1*(number2+1)*(number1+number2+77777));
}

#include <vector>

// Entry of the name index: an item name plus the number of the record it
// belongs to.
struct index_name
{
    unsigned int recnum;
    char name[21];
    // Orders entries by name only; recnum does not take part in the comparison.
    bool operator< (const index_name & rhs) const
    {
        return strcmp(name, rhs.name) < 0;
    } 
};

// Entry of the number index: an item number plus the number of the record
// it belongs to.
struct index_number
{
    unsigned int recnum;
    long long number;
    // Orders entries by number only; recnum does not take part in the comparison.
    bool operator< (const index_number & rhs) const
    {
        return number < rhs.number;
    }
};

// Fills name with sizname - 1 random letters (upper or lower case chosen at
// random) followed by a terminating '\0'.
//   name    - destination buffer of at least sizname characters
//   sizname - size of the buffer including the terminator; for values <= 0
//             nothing is written (name[sizname-1] would be outside the buffer)
void createItemName(char name[], int sizname)
{
    if (sizname <= 0)
        return;
    for (int j = 0; j<sizname-1; j++)
    {
        const int letter = rand() % 26;
        const char first = (rand() % 2) ? 'A' : 'a';
        name[j] = (char)(first + letter);
    }
    name[sizname-1] = '\0';  
}

// One item of a data record: fixed-size name, number and description.
struct item
{
    char name[21];
    long long number;
    char description[100];
    // Orders items by name only; number and description are ignored.
    bool operator<(const item & it) const
    {
        const int cmp = strcmp(name, it.name);
        return cmp < 0;
    }
};

// One record of the data file: four items stored back to back.
struct fouritems
{
    item items[4];
    // Zero-fills the record; with bfill == true every item additionally
    // gets a generated name, number and description.
    fouritems(bool bfill = false); 

};

fouritems::fouritems(bool bfill)
{
    // start from an all-zero record so that no uninitialized bytes end up in the file
    memset(this, 0, sizeof(fouritems));
    if (bfill == true)
    {
        for (int j = 0; j<4; ++j)
        {
            createItemName(items[j].name, (int)sizeof(items[j].name));
            items[j].number = createItemNumber();
            // strcpy_s needs room for the terminating '\0' inside the given
            // size. with size 99 a text of exactly 99 characters makes the
            // call fail (the CRT then invokes the invalid parameter handler).
            // so pass the real buffer size and request one character less.
            const size_t descsize = sizeof(items[j].description);
            strcpy_s(items[j].description, descsize, createItemText((int)descsize - 1).c_str());
        }
    } 
}

// Merges NUM_FILES index files into one output file.
// In every round the smallest (by operator<) of the current head records is
// written to outputfile and replaced by the next record of the same input
// file. The result is only sorted if every input file is sorted itself
// (presumably guaranteed by the caller - verify).
//   index       - fixed-size record type, read and written as raw bytes
//   outputfile  - opened binary output stream; closed on success
//   inputfiles  - array of NUM_FILES opened binary input streams
// Returns 0 on success and a non-zero code if an input file yields no first
// record or if writing fails.
template <class index>
int mergeFiles(std::ofstream & outputfile, std::ifstream inputfiles[])
{   
    index indexarr[NUM_FILES]   = { 0 };    // current head record of each input file
    int   recordsize = sizeof(index);
    std::vector<bool>  eofreached(NUM_FILES, false); 
    for (unsigned int i = 0; i < NUM_FILES; ++i)
    {
        if (!inputfiles[i].read((char*)&indexarr[i], recordsize))
        {
            // log error message
            // empty file or read error. errno may still be 0 here (plain
            // end-of-file does not set it), so never return "success".
            return (errno != 0) ? errno : -1;
        }
    }
    while (true)
    {
        // find the input file whose head record is the smallest;
        // exhausted files are skipped
        unsigned int n_min = 0;
        for (unsigned int n = 1; n < NUM_FILES; ++n)
        {
            if (eofreached[n] == true)
                continue;
            if (eofreached[n_min] == true || indexarr[n] < indexarr[n_min])
            {
                n_min = n;
            }
        }
        // n_min can only point to an exhausted file if all files are exhausted
        if (eofreached[n_min] == true)
            break;
        if (!outputfile.write((char*)&indexarr[n_min], recordsize))
        {
            // log error message (e.g. disk full)
            return (errno != 0) ? errno : -1;
        }
        if (!inputfiles[n_min].read((char*)(&indexarr[n_min]), recordsize))
        {
            inputfiles[n_min].close();
            eofreached[n_min] = true;
        }
    }
    outputfile.close();
    return 0;
}

// Builds a file name from prefix and extension. A non-negative filenum is
// inserted as "_<filenum>" between both parts; a negative one (the default)
// is left out completely.
std::string createFilename(const std::string & strprefix, const std::string & strext, int filenum = -1)
{
    std::ostringstream name;
    name << strprefix;
    if (filenum >= 0)
    {
        name << "_" << filenum;
    }
    name << strext;
    return name.str();
}

// Creates the test data: one data file with NUM_FILES * NUM_RECORDS fouritems
// records and, for names and numbers each, NUM_FILES partial index files that
// are finally merged into one big index file per index.
// Returns 0 on success, a non-zero error code otherwise.
int main()
{
    std::string strdatafile = createFilename("c:\\dp4\\flout", ".dat");
    std::ofstream datafile(strdatafile.c_str(), std::ios::binary | std::ios::out);
    if (!datafile.is_open())
    {
        // log error. errno may be 0 after a stream failure, so make sure
        // that a failure never is returned as 0
        return (errno != 0) ? errno : -1;
    }
    unsigned int recnum = 0;
    for (int f = 0; f < NUM_FILES; ++f)
    {
        std::string strnameindexfile = createFilename("c:\\dp4\\flname", ".idx", f+1);
        std::ofstream namefile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);
        if (!namefile.is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
        std::string strnumberindexfile = createFilename("c:\\dp4\\flnumber", ".idx", f+1);
        std::ofstream numberfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);
        if (!numberfile.is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
        // the sets are created inside the f-loop, so each partial index file
        // starts with empty sets; std::set keeps the keys ordered by operator<
        std::set<index_name> nameset;
        std::set<index_number> numberset;
        for (size_t n = 0; n < NUM_RECORDS; ++n, ++recnum)
        {
            fouritems record(true);   // create filled record
            if (!datafile.write((char*)&record, sizeof(record)))
            {
                // log error and return
                return (errno != 0) ? errno : -1;
            }
            // each of the four items gets one name key and one number key,
            // both pointing to the record number in the data file
            for (int j = 0; j < 4; ++j)
            {
                index_name idxnam;
                strcpy_s(idxnam.name, 21, record.items[j].name);
                idxnam.recnum = recnum;
                nameset.insert(idxnam);
                index_number idxnum;
                idxnum.number =  record.items[j].number;
                idxnum.recnum = recnum;
                numberset.insert(idxnum);
            }
        }
        std::set<index_name>::iterator inam;
        for (inam = nameset.begin(); inam != nameset.end(); ++inam)
        {
            index_name nam = (*inam);
            if (!namefile.write((char*)(&nam), sizeof(nam)))
            {
                // log error and return with error code
                return (errno != 0) ? errno : -1;
            }
        }
        // repeat all the above for numberset
        std::set<index_number>::iterator inum;
        for (inum = numberset.begin(); inum != numberset.end(); ++inum)
        {
            index_number num = *inum;
            if (!numberfile.write((char*)(&num), sizeof(index_number)))
            {
                // log error and return with error code
                return (errno != 0) ? errno : -1;
            }
        }
        namefile.close();
        numberfile.close();
    }

    datafile.close();
    // here we have 1 datafile and NUM_FILES index files for each index
    // we then would merge the index files for names to a big name index file
    // ... and merge the NUM_FILES number index files to a big number index file as well
    std::ifstream namefiles[NUM_FILES];
    std::ifstream numberfiles[NUM_FILES];
    for (int i = 0; i < NUM_FILES; i++)
    {
        std::string strnamefile = createFilename("c:\\dp4\\flname", ".idx", i+1);
        namefiles[i].open(strnamefile.c_str(), std::ios::binary | std::ios::in);
        std::string strnumberfile = createFilename("c:\\dp4\\flnumber", ".idx", i+1);
        numberfiles[i].open(strnumberfile.c_str(), std::ios::binary | std::ios::in);
        if (!namefiles[i].is_open() || !numberfiles[i].is_open())
        {
            // log error
            return (errno != 0) ? errno : -1;
        }
    }
    std::string strnameindexfile = createFilename("c:\\dp4\\flname", ".idx");  
    std::ofstream nameindexfile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);
    std::string strnumberindexfile = createFilename("c:\\dp4\\flnumber", ".idx");  
    std::ofstream numberindexfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);
    if (!nameindexfile.is_open() || !numberindexfile.is_open())
    {
        // log error
        return (errno != 0) ? errno : -1;
    }
    int ret;
    if ((ret = mergeFiles<index_name>(nameindexfile, namefiles)) != 0)
    {
        // log error 
        return ret;
    }
    if ((ret = mergeFiles<index_number>(numberindexfile, numberfiles)) != 0)
    {
        // log error 
        return ret;
    }
    return 0;
}

Open in new window


Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40639104
Thanks a lot.
I run your current codes. But it does slowly create the relevant files. why?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40639117
const unsigned int NUM_RECORDS = 10000000;
i made a mistake and used 10 million instead of 1 million

it creates 4 names and  4 number indices per record what is 800 million index records.

you may change to NUM_FILES = 100 and NUM_RECORDS = 500000 what should speed-up the file creation.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40644688
Sara,
I really appreciate you a lot.

I increase the number of files to 80 and the process does spend a few hours to finish generating all files and I feel that it is a little bit slower than the old process which is not to search by several name/number. Do you think the speed is acceptable?
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40644825
80 files and 1 million records means that your program writes 80 million items records, and 2 x 320 million = 640 million name index records (both small files and big file) and 640 million number index records.

that is 41 GB for data file,  18 GB for name index files and 10 GB for number index files, what is about 70 GB in total.

if you currently need 4 hours (what is about 14,000 seconds) it is 5 MB/sec or 40 mbit/s what seems not so bad and probably could be improved by factor 5 - 10 when using an ssd drive. if you would use a normal disk which is fairly empty and not the same disk where the operation system and your program is running on, you also should/could improve the figures by factor 2 at least.

programmatically you could write the files in bigger chunks and not record by record. for example you could create an array of 1000 fouritems records and fill that array rather than writing each record to the data file. then if the array was full you write 1000 x sizeof(fouritems) bytes to file with one call. by doing so, you have 80 thousand writes instead of 80 million what should be much faster. same thing could be done for the index files (you may use the same array for both the small files and the big file).

last thing you could do is to write the description texts packed to a further text data file. if we assume that the average length of the description is only a third of the maximum size, you would save at least half of the data file's size and have (only) 20 GB for both the data file and the text data file, which is about 30 percent less of the total size.

Sara
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40646014
Thanks a lot Sara.

I see flname file is 8.750 GB below
https://dl.dropboxusercontent.com/u/40211031/tt5.png

while flnumber file is 133,440 KB
https://dl.dropboxusercontent.com/u/40211031/tt4.png

do you think there can be anything wrong with flnumber file?
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40646296
Sorry, please omit the 2nd screenshot, as real size of flnumber should be about 509 MB and I'll upload the file for you to see it, if possible.
0
 
LVL 10

Author Comment

by:HuaMinChen
ID: 40646344
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40646385
do you think there can be anything wrong with flnumber file?
yes. I assume there are too many duplicate numbers created which would reduce the total number of numbers in the std::set. if that is the case the small files would have different size and don't contain 4 million numbers but much less. you also may output the size() of the std::set before they were written to file. the goal should be that it was 4 million.  

do you use the createItemNumber function I posted? if no, then the issue is clear because the number you created with your algorithm creates a lot of duplicates. if yes, you should add some more summands to the expression in the createItemNumber function multiplied by (rand()+x) where x is some odd offset number such that the numbers it created are more likely to be unique as they are now. another way is to check the number before inserting it to the set like

// fragment: creates an index entry for the current record whose number is
// not yet contained in number_set. recnum and number_set must exist in the
// surrounding code.
index_number new_index = { 0 };
new_index.recnum = recnum;

// draw numbers until one is found that is not in the set yet.
// index_number::operator< compares the number member only, so find() matches
// any entry with the same number regardless of its recnum.
// the loop only ends if createItemNumber is able to return a number that is
// not in the set.
do
{
    new_index.number = createItemNumber();
} 
while (number_set.find(new_index) != number_set.end());

number_set.insert(new_index);

Open in new window

the above would guarantee that each number is unique. but it is dangerous: if the createItemNumber would not return at least 1 million different numbers the above loop is infinite.

Sara

p. s. I can't look into dropbox before tomorrow.
0
 
LVL 32

Expert Comment

by:sarabande
ID: 40646418
// Creates a non-negative pseudo random number digit by digit.
// The number has a leading digit between 0 and 8 followed by 13 to 18
// further random digits, so it never exceeds 8999999999999999999.
long long createItemNumber()
{
    // the maximum signed 64 bit integer is 9223372036854775807 and has 19 digits

    // in order to not create an overflow (what gives a negative number) we make the highest digit between 0 and 8
    long long number = rand()%9;

    // the count of further digits is chosen once. if rand() is called in the
    // loop condition, the limit changes with each pass and the lengths are
    // not evenly distributed anymore.
    const int ndigits = 13+(rand()%6);

    // the loop creates a decimal number which has between 14-19 digits
    // (less if the leading digits happen to be 0).
    for (int n = 0; n < ndigits; ++n)
           number = (number*10) + (rand()%10);
    return number;
}

Open in new window


Sara
0

Featured Post

Top 6 Sources for Identifying Threat Actor TTPs

Understanding your enemy is essential. These six sources will help you identify the most popular threat actor tactics, techniques, and procedures (TTPs).

Join & Write a Comment

This article shows you how to optimize memory allocations in C++ using placement new. Applicable especially to usecases dealing with creation of large number of objects. A brief on problem: Lets take example problem for simplicity: - I have a G…
Ever notice how you can't use a new drive in Windows without having Windows assigning a Disk Signature?  Ever have a signature collision problem (especially with Virtual Machines?)  This article is intended to help you understand what's going on and…
Video by: Grant
The goal of this video is to provide viewers with basic examples to understand and use while-loops in the C programming language.
The viewer will learn how to use the return statement in functions in C++. The video will also teach the user how to pass data to a function and have the function return data back for further processing.

744 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question

Need Help in Real-Time?

Connect with top rated Experts

14 Experts available now in Live!

Get 1:1 Help Now