asked on

Change to the struct

Hi,
I have struct like

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size record with three (name, number, text) groups.
// The program in this thread writes and reads it as raw bytes, so the member
// order and sizes below also define the layout of the binary files.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     // Returns the length of item1 in bytes, at most sizeof(item1).
     // The scan is bounded by the array size, so an item1 that lacks the
     // terminating '\0' is never read past its end.
     int  get_len() const
     {
         const void * nul = memchr(item1, '\0', sizeof(item1));
         if (nul == 0)
             return (int)sizeof(item1);
         return (int)((const char *)nul - item1);
     }
     // Converts item1 to wide characters and stores at most
     // min(get_len(), sizfld) of them in nm_uni.
     // nm_uni : destination buffer with room for at least sizfld elements
     // sizfld : capacity of nm_uni; nothing is written when it is <= 0
     // The result is NOT guaranteed to be '\0' terminated (same as before).
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         // a negative size would be turned into a huge count by mbstowcs
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // convert from a terminated copy so mbstowcs cannot run past item1
         const int srclen = get_len();
         char src[sizeof(item1) + 1];
         memcpy(src, item1, srclen);
         src[srclen] = '\0';
         const int maxwide = (srclen < sizfld) ? srclen : sizfld;
         mbstowcs(nm_uni, src, maxwide);
     }
     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // bounded compare: never reads beyond the 21-byte arrays
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0)
               return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window

here is the relevant project

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;

// Generates 80 chunk files of 1,000,000 random records each (every chunk is
// sorted by struc3::operator<) and then merges them into
// c:\dp4b\flout_w.bin.
// Returns 0 on success, errno if an output file cannot be opened, -2 if a
// chunk file cannot be written, -3 / -4 if a chunk cannot be reopened / read
// for the merge and -5 if writing the merged file fails.
int main()
{
	const int num_files = 80;                 // number of chunk files
	const long long recs_per_file = 1000000;  // records generated per chunk
	long long integer_limit = std::numeric_limits<std::streamsize>::max();
	long long cnt_val = 0;                    // running record number over all chunks

	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	// Seed once. Re-seeding with time(NULL) for every chunk restarts the same
	// random sequence whenever two chunks start within the same second.
	srand((unsigned int)time(NULL));

	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		std::ofstream chunk(filename.str().c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno;

		// the set keeps the records of this chunk ordered by struc3::operator<
		std::set<struc3> records;
		for (long long cnt = 0; cnt < recs_per_file; cnt++)
		{
			cnt_val++;
			try
			{
				struc3 val = { 0 };
				// 20 random letters, upper or lower case; item1[20] stays '\0'
				for (int j = 0; j < 20; j++)
				{
					val.item1[j] = (char)(rand() % 26 + ((rand() % 2) ? 'A' : 'a'));
				}
				val.i1_int = cnt_val;
				// i1_val = "<item1> <item1> <item1>" (62 characters, fits in 100)
				strcpy(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				records.insert(val);
				if ((cnt + 1) % 10000 == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report and go on
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		// streams report failures through their state, not through exceptions
		if (!chunk)
		{
			std::cout << "write error " << errno << " on " << filename.str() << std::endl;
			return -2;
		}
	}

	// ---- merge: names[n] always holds the next unwritten record of chunk n ----
	std::ifstream inputfiles[num_files];
	struc3 names[num_files] = { 0 };
	bool eof_reached[num_files] = { false };
	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		inputfiles[f].open(filename.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with
		int n_min = -1;
		for (int n = 0; n < num_files; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// write the record exactly once, before the buffer is refilled
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		// fetch the next record of that chunk; on end of file retire the chunk
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	std::cout << "the maximum int value is " << integer_limit << std::endl;
	system("pause>nul");   // "nul" is the Windows null device; "null" would create a file
	return 0;
}

Open in new window

The project above makes use of the struct. In the struct, operator< is used for sorting. Supposing that I have also filled item2 and item3 with values in the code above, what should I change in the code if I sometimes need to sort the file by item2 or item3?

ozo

// Member of struc3: lexicographic order over
// item1, i1_int, item2, i2_int, item3, i3_int (in that sequence).
// A later field is only looked at when all earlier fields are equal.
bool operator< (const struc3 & a2) const
{
    int cmp = strcmp(item1, a2.item1);
    if (cmp != 0)
        return cmp < 0;
    if (i1_int != a2.i1_int)
        return i1_int < a2.i1_int;

    cmp = strcmp(item2, a2.item2);
    if (cmp != 0)
        return cmp < 0;
    if (i2_int != a2.i2_int)
        return i2_int < a2.i2_int;

    cmp = strcmp(item3, a2.item3);
    if (cmp != 0)
        return cmp < 0;
    // last key: equal records compare as "not less"
    return i3_int < a2.i3_int;
}

Peter Chan

ASKER

Many thanks Ozo. Suppose that there are some items to the struct. Is there any way to each time, only choose one specific item, like item1 and i1_int, or item2 and i2_int, to sort? I mean, once I've selected one specific item, like item1, item2, then the whole file should be sorted by the specific item.

ozo

How are you making the selection?

sarabande

you may use a functor for being able to use different sort criteria on a container:

// Sample record with two string fields and one integer field.
struct threeitems 
{
    std::string item1;
    std::string item2;
    int item3;
    threeitems(const std::string & i1, const std::string & i2, int i3) : item1(i1), item2(i2), item3(i3) {}
};

// Identifies a field of threeitems; enone means "no field", emax is one past
// the last valid value.
enum esort { enone = 0, eitem1 = 1, eitem2=2, eitem3=3, emax };

// Configurable "less than" for threeitems with up to three sort keys.
// Each constructor argument is a field number (1..3); a positive value sorts
// that field ascending, a negative value descending. The first argument that
// is zero or out of range ends the key list, e.g. sort_functor(1, -2) sorts by
// item1 ascending and, for equal item1, by item2 descending.
struct sort_functor
{
    int      nsort;          // number of valid entries in sortorder / bascending
    esort  sortorder[3];     // field compared at each level
    bool   bascending[3];    // direction at each level
    sort_functor(int firstitem, int seconditem = enone, int thirditem = enone)
        :  nsort(0)
    {
        const int requested[3] = { firstitem, seconditem, thirditem };
        for (int i = 0; i < 3; ++i)
        {
            sortorder[i]  = enone;
            bascending[i] = true;
        }
        for (int i = 0; i < 3; ++i)
        {
            const int v = requested[i];
            // range test without abs(): abs(INT_MIN) would be undefined
            if (v == 0 || v <= -emax || v >= emax)
                break;
            sortorder[nsort]  = (esort)(v < 0 ? -v : v);
            bascending[nsort] = (v > 0);
            ++nsort;
        }
    }
    // Returns true when ti1 has to be placed before ti2.
    // const, so the functor can also be used as comparator of std::set/std::map.
    bool operator()(const threeitems & ti1, const threeitems & ti2) const
    {
        for (int i = 0; i < nsort; ++i)
        {
            // three-way result for the current key: <0, 0 or >0
            int cmp = 0;
            switch(sortorder[i])
            {
            case eitem1:
                cmp = ti1.item1.compare(ti2.item1);
                break;
            case eitem2:
                cmp = ti1.item2.compare(ti2.item2);
                break;
            case eitem3:
                cmp = (ti1.item3 < ti2.item3) ? -1 : ((ti2.item3 < ti1.item3) ? 1 : 0);
                break;
            default:
                break;
            }
            // The first key that differs decides. Lower-priority keys must
            // not be consulted any more, otherwise comp(a,b) and comp(b,a)
            // could both be true.
            if (cmp != 0)
                return bascending[i] ? (cmp < 0) : (cmp > 0);
        }
        return false;   // equal on all keys
    }
};

void mysort()
{
    std::vector<threeitems> tiarr;
    tiarr.push_back(threeitems("abc", "xyz", 123));
    tiarr.push_back(threeitems("aaa", "bbb", 999));
    tiarr.push_back(threeitems("zzz", "yyy", 111));

    std::sort(tiarr.begin(), tiarr.end(), sort_functor(1, -2, -3));
}

Open in new window

the sample sorts the container by first item in ascending order, then (in case of duplicates) by second item in descending order and finally by third item in descending order.

the sort_functor works like a function pointer but has the advantage that you could parameterize it unlike to a function pointer. if you would use sort_functor(3) it would only use item3 in ascending order.

a few things you have to consider if you think on using a functor for the nameval structure:

- you currently were using a std::set and operator< of struct nameval. but std::set cannot be used with a functor.
unlike to std::sort function the sort criteria is a property of the set and may not be a matter of change for the set.
- if you want to use a functor you should change the container to std::vector.
then you would push_back the records to the vector unsorted and then sort it by using the functor.
i would guess, it is slower than by using the std::set and the built-in less operator but i might be wrong.
- in no case you could 'sort a file' by that. you always have to sort in memory and then write a new file.
- because of that you cannot sort the huge file as it is too big to load it to memory.
- but you can sort the smaller containers before you write them to disk.
- of course all files have to be sorted by using the same sort criteria.
- and, the query program also must use the same criteria for the binary search.
- for example if the main search criteria is the number value, your binary search
would compare numbers and not strings as it does now.

so actually i don't see much benefit in using different sort criterias, beside you want to provide two huge files, where one is sorted by name and the second is sorted by number.

Sara

Peter Chan

ASKER

Many many thanks Sara.
Is there no way to use set to work with functor?

If there is really no way for that, how to adjust these

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;

// Generates 80 chunk files of 1,000,000 random records each (every chunk is
// sorted by struc3::operator<) and then merges them into
// c:\dp4b\flout_w.bin.
// Returns 0 on success, errno if an output file cannot be opened, -2 if a
// chunk file cannot be written, -3 / -4 if a chunk cannot be reopened / read
// for the merge and -5 if writing the merged file fails.
int main()
{
	const int num_files = 80;                 // number of chunk files
	const long long recs_per_file = 1000000;  // records generated per chunk
	long long cnt_val = 0;                    // running record number over all chunks

	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	// Seed once. Re-seeding with time(NULL) for every chunk restarts the same
	// random sequence whenever two chunks start within the same second.
	srand((unsigned int)time(NULL));

	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		std::ofstream chunk(filename.str().c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno;

		// the set keeps the records of this chunk ordered by struc3::operator<
		std::set<struc3> records;
		for (long long cnt = 0; cnt < recs_per_file; cnt++)
		{
			cnt_val++;
			try
			{
				struc3 val = { 0 };
				// 20 random letters, upper or lower case; item1[20] stays '\0'
				for (int j = 0; j < 20; j++)
				{
					val.item1[j] = (char)(rand() % 26 + ((rand() % 2) ? 'A' : 'a'));
				}
				val.i1_int = cnt_val;
				// i1_val = "<item1> <item1> <item1>" (62 characters, fits in 100)
				strcpy(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				records.insert(val);
				if ((cnt + 1) % 10000 == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report and go on
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		// streams report failures through their state, not through exceptions
		if (!chunk)
		{
			std::cout << "write error " << errno << " on " << filename.str() << std::endl;
			return -2;
		}
	}

	// ---- merge: names[n] always holds the next unwritten record of chunk n ----
	std::ifstream inputfiles[num_files];
	struc3 names[num_files] = { 0 };
	bool eof_reached[num_files] = { false };
	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		inputfiles[f].open(filename.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with
		int n_min = -1;
		for (int n = 0; n < num_files; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// write the record exactly once, before the buffer is refilled
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		// fetch the next record of that chunk; on end of file retire the chunk
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	return 0;
}

Open in new window

to use vector instead?

Here is the struct file.

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size record with one (name, number, text) group.
// The program in this thread writes and reads it as raw bytes, so the member
// order and sizes below also define the layout of the binary files.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     // Returns the length of item1 in bytes, at most sizeof(item1).
     // The scan is bounded by the array size, so an item1 that lacks the
     // terminating '\0' is never read past its end.
     int  get_len() const
     {
         const void * nul = memchr(item1, '\0', sizeof(item1));
         if (nul == 0)
             return (int)sizeof(item1);
         return (int)((const char *)nul - item1);
     }
     // Converts item1 to wide characters and stores at most
     // min(get_len(), sizfld) of them in nm_uni.
     // nm_uni : destination buffer with room for at least sizfld elements
     // sizfld : capacity of nm_uni; nothing is written when it is <= 0
     // The result is NOT guaranteed to be '\0' terminated (same as before).
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         // a negative size would be turned into a huge count by mbstowcs
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // convert from a terminated copy so mbstowcs cannot run past item1
         const int srclen = get_len();
         char src[sizeof(item1) + 1];
         memcpy(src, item1, srclen);
         src[srclen] = '\0';
         const int maxwide = (srclen < sizfld) ? srclen : sizfld;
         mbstowcs(nm_uni, src, maxwide);
     }
     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // bounded compare: never reads beyond the 21-byte arrays
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0)
               return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window

sarabande

Is there no way to use set to work with functor?

actually, std::set has a second template argument ('Traits', also called 'Compare') which is a functor type and defaults to std::less<key_type> (which uses the operator< of the key class). if you define the set without constructor arguments, the functor is created by its default constructor, so a functor class without a default constructor cannot be used that way and you have no chance to 'configure' the sorting. (note: std::set also has a constructor that takes a comparator object, e.g. std::set<threeitems, sort_functor> tiset(sort_functor(1, -2)); so a configured functor can be passed when the set is created, but the order cannot be changed afterwards.)

the following code

// NOTE: this snippet is shown because it does NOT compile: the set is defined
// without arguments, so it tries to default-construct its comparator, and
// sort_functor only has a constructor that requires at least one argument.
std::set<threeitems, sort_functor> tiset;
    tiset.insert(threeitems("abc", "xyz", 123));

Open in new window

would give error C2512: 'sort_functor::sort_functor' : no appropriate default constructor available' because of that.

I found a way out by the following:

struct sort_functor_item1_asc : public sort_functor
    {
        sort_functor_item1_asc() : sort_functor(eitem1) {}
    };

    std::set<threeitems, sort_functor_item1_asc> tiset1;
    tiset1.insert(threeitems("abc", "xyz", 123));

Open in new window

where you derive from the functor and so create the required default constructor.

how to adjust these to use vector instead?

you have to do 3 things:

- define 'std::vector<nameval> records' instead of 'std::set<nameval> records'.
- use push_back instead of insert
- add std::sort(records.begin(), records.end()) after inserting 1 million of records to the vector.

the last would sort the records by using operator< of struct nameval.

if you want to use a functor you may modify the sort_functor struct I posted and adjust it such that it fits to the nameval struct.

note, technically you have all possibilities with little efforts. but you need to consider what you want to achieve by making the sort criteria dynamically. as told the sort order is crucial for your second program which reads the records. if you make a change to the savebinaryfile you have to make the same change to readbinaryfile. also sorting for the number value makes not so much sense as the number is continuous anyway. if you want to sort your files by number value you don't need any sorting at all but simply could write the records sequentially to the file - what also could be the huge file as you also don't need a merge. if you would simply take your current working programs and write each record you created to a further binary output file (directly when inserting it to the std::set), then your query program could operate on two huge files of same size, one is sorted by name and one is sorted by number. if you search by number you could use the number as a record number and read directly from file ordered by number. and if the user searched by name you could use the binary search. for this there are no functors needed and your efforts are minimal.

Sara

Peter Chan

ASKER

Many thanks Sara.
I

if you want to sort your files by number value you don't need any sorting at all but simply could write the records sequentially to the file - what also could be the huge file as you also don't need a merge. if you would simply take your current working programs and write each record you created to a further binary output file (directly when inserting it to the std::set), then your query program could operate on two huge files of same size, one is sorted by name and one is sorted by number. if you search by number you could use the number as a record number and read directly from file ordered by number. and if the user searched by name you could use the binary search. for this there are no functors needed and your efforts are minimal.

Does it mean, if I have several items to sort, I then need to create several huge files, to sort, depending on that I still use set to the programs?

Peter Chan

ASKER

So the point is that it is better to use a set rather than a vector, because the vector would be slower, right?

sarabande

Does it mean, if I have several items to sort, I then need to create several huge files, to sort, depending on that I still use set to the programs?

yes. it is the same as in a database where you were using multiple index columns or combinations of index columns.

the only difference: while in a database you have two storages one for the data and one for the index (keys) you currently were using the sorted records themselves for index search. that makes sense as your records are mainly keys and therefore it is not necessary to have a separate data storage where the keys are referring to.

however, if you intend to make the number value a second index, things are different. alternatively to creating a second huge file now sorted by number, you could store records with two members such you got a mapping of the number value to the record number in the huge file where the other members are stored.

flout.bin:

[AAAABXC...., 16666666]
[AAAAFGA...,  55111511] 
....
[zzzzsabbc..., 47001234]

flout.idx:

[1, 8868686]
...
[16666666, 0]         -> points to record 0 of flout.bin
...
[47001234, 79999999] -> points to the last of 80 million records
...
[55111511, 1]    -> points to 2nd record of flout.bin

Open in new window

if your next structure has more than 2 members it would make sense to using the above design for all your indexes. that means you would not sort the records at all but simply write the records sequentially to a (huge) data file which is now your database. additionally you would create index files for each index you want to support. for example if you want to have an own index for each member, you would create an index file for each of them. index file creation would be done like your current savebinaryfile works. because of the huge amount of keys, it is not possible to hold 80 million (or more) keys in memory. the way out is, you know, to create smaller chunks of - say 1 million - index files and merge them at end of program to one huge index file. you would do that for each index directly while writing to the data file. the readbinayfile then would have to open 4 files, the data file and the 3 index files. depending on the search request it would decide which index file it has to use for the binary search. as the index file contains the index key, the binary search either would find the key or not. in the first case it would read from data file by using the record number that was stored with the found key in the index file.

Sara

Peter Chan

ASKER

Sara,
I appreciate you a lot.

To have different index file due to one specific order selected, does it mean I should have more than one struct file for that? If yes, what to adjust to this

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size record with four (name, number, text) groups.
// The program in this thread writes and reads it as raw bytes, so the member
// order and sizes below also define the layout of the binary files.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Returns the length of item1 in bytes, at most sizeof(item1).
     // The scan is bounded by the array size, so an item1 that lacks the
     // terminating '\0' is never read past its end.
     int  get_len() const
     {
         const void * nul = memchr(item1, '\0', sizeof(item1));
         if (nul == 0)
             return (int)sizeof(item1);
         return (int)((const char *)nul - item1);
     }
     // Converts item1 to wide characters and stores at most
     // min(get_len(), sizfld) of them in nm_uni.
     // nm_uni : destination buffer with room for at least sizfld elements
     // sizfld : capacity of nm_uni; nothing is written when it is <= 0
     // The result is NOT guaranteed to be '\0' terminated (same as before).
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         // a negative size would be turned into a huge count by mbstowcs
         if (nm_uni == 0 || sizfld <= 0)
             return;
         // convert from a terminated copy so mbstowcs cannot run past item1
         const int srclen = get_len();
         char src[sizeof(item1) + 1];
         memcpy(src, item1, srclen);
         src[srclen] = '\0';
         const int maxwide = (srclen < sizfld) ? srclen : sizfld;
         mbstowcs(nm_uni, src, maxwide);
     }
     // Sort order: item1 (byte-wise), then i1_int.
     bool operator< (const struc3 & a2) const
     {
           // bounded compare: never reads beyond the 21-byte arrays
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0)
               return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

Open in new window

inside which, I have several items.

And how to adjust the following altogether?

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;

// Fills one (name, text) pair of a record:
// item gets 20 random letters (upper or lower case) plus the terminating
// '\0', val gets "<item> <item> <item>" (62 characters, fits in 100).
static void fill_random_item(char (&item)[21], char (&val)[100])
{
	for (int j = 0; j < 20; j++)
	{
		item[j] = (char)(rand() % 26 + ((rand() % 2) ? 'A' : 'a'));
	}
	item[20] = '\0';
	strcpy(val, item);
	strcat(val, " ");
	strcat(val, item);
	strcat(val, " ");
	strcat(val, item);
}

// Generates 80 chunk files of 1,000,000 random records each (every chunk is
// sorted by struc3::operator<) and then merges them into
// c:\dp4b\flout_w.bin.
// Returns 0 on success, errno if an output file cannot be opened, -2 if a
// chunk file cannot be written, -3 / -4 if a chunk cannot be reopened / read
// for the merge and -5 if writing the merged file fails.
int main()
{
	const int num_files = 80;                 // number of chunk files
	const long long recs_per_file = 1000000;  // records generated per chunk
	long long integer_limit = std::numeric_limits<std::streamsize>::max();
	long long cnt_val = 0;                    // running record number over all chunks

	std::ofstream file_w("c:\\dp4b\\flout_w.bin", std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	// Seed once. Re-seeding with time(NULL) for every chunk restarts the same
	// random sequence whenever two chunks start within the same second.
	srand((unsigned int)time(NULL));

	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		std::ofstream chunk(filename.str().c_str(), std::ios::binary | std::ios::out);
		if (!chunk.is_open()) return errno;

		// the set keeps the records of this chunk ordered by struc3::operator<
		std::set<struc3> records;
		for (long long cnt = 0; cnt < recs_per_file; cnt++)
		{
			cnt_val++;
			try
			{
				struc3 val = { 0 };

				fill_random_item(val.item1, val.i1_val);
				val.i1_int = cnt_val;

				fill_random_item(val.item2, val.i2_val);
				val.i2_int = cnt_val;

				fill_random_item(val.item3, val.i3_val);
				val.i3_int = cnt_val;

				fill_random_item(val.item4, val.i4_val);
				val.i4_int = cnt_val;

				records.insert(val);
				if ((cnt + 1) % 10000 == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (const std::exception& e)
			{
				// e.g. std::bad_alloc from the set; report and go on
				std::cout << e.what() << '\n';
			}
		}

		for (std::set<struc3>::const_iterator it = records.begin(); it != records.end(); ++it)
		{
			chunk.write((const char *)&(*it), sizeof(struc3));
		}
		chunk.close();
		// streams report failures through their state, not through exceptions
		if (!chunk)
		{
			std::cout << "write error " << errno << " on " << filename.str() << std::endl;
			return -2;
		}
	}

	// ---- merge: names[n] always holds the next unwritten record of chunk n ----
	std::ifstream inputfiles[num_files];
	struc3 names[num_files] = { 0 };
	bool eof_reached[num_files] = { false };
	for (int f = 0; f < num_files; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		inputfiles[f].open(filename.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3;
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4;
	}

	long long write_counter = 0;
	while (true)
	{
		// pick the chunk whose current record is smallest, using the same
		// ordering (struc3::operator<) the chunks were sorted with
		int n_min = -1;
		for (int n = 0; n < num_files; ++n)
		{
			if (eof_reached[n]) continue;
			if (n_min < 0 || names[n] < names[n_min])
				n_min = n;
		}
		if (n_min < 0) break; // all chunks are exhausted

		// write the record exactly once, before the buffer is refilled
		if (!file_w.write((const char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		// fetch the next record of that chunk; on end of file retire the chunk
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	std::cout << "the maximum int value is " << integer_limit << std::endl;
	return 0;
}

Open in new window

ozo

Maybe you can have subclasses that all inherit from the parent struct.

Peter Chan

ASKER

Can I have more details to the way? Thanks.

sarabande

you need a struct or class for each index.

// Index record for the name key: the key itself plus the number of the
// data-file record it belongs to.
struct index_name 
{
     char item1[21];        // fixed: the type was written twice ("char char")
     unsigned int recnum;
};

// Index record for the numeric key.
struct index_number 
{
     long long number;
     unsigned int recnum;
};


// Index record for the description key.
struct index_desc
{
      char description[100];
      unsigned int recnum;
};

Open in new window

instead of 3 structures (classes)you also could use one template class instead.

// key types used by the index records below
typedef char Name[21];
typedef char Desc[100];

// Generic index record: a key of type T together with a record number.
template <typename T>
struct Index
{
     T t;                    // the key value
     unsigned int recnum;    // record number stored with the key
};

// one index record object per key type
Index<Name>      name_index;
Index<long long> number_index;
Index<Desc>      desc_index;

Open in new window

you also could use std::pair for that purpose or use a base class for all index structures as ozo has suggested.

// Base of all index records: holds the record number only.
struct Index
{
      unsigned int recnum;
};

// Index record for the name key; recnum is inherited from Index.
struct Name_Index : Index
{
      char name[21];
};
// and so on

Open in new window

all these alternatives are equivalent for your purposes. there are a few technical advantages/disadvantages for any of the solutions. I would suggest to start with 3 structures in order to keep it simple. you easily could switch to a better class design if things were working. the method to create the index files could be a copy of the current mechanism based on the index structures where you create smaller files and merge them to one big file. you would do that for each index separately.

however, before you start with new index structures you should make clear what you want to achieve. actually, I can't see any sense for creating an index for a 64-bit integer value which has the same value as the record number it points to. obviously such an index is redundant and of no value. also the third member, a 100 byte string, rarely is a good candidate for an index file with fix-sized record length. 100 characters is a maximum size and most strings would have much less text such that your index file would contain mostly zeros what is a bad design. moreover, for an index you would need unique texts what rarely could guaranteed for such texts. in the database world index files were based on b-trees which allow different lengths of the keys and also could handle duplicate or empty keys.

to sum up, if your original structure would contain a 20 char unique name key as first member, an integer which is a sequential number from 1 to 80 million, and a "description" which is any free text of variable length up to 99 characters and could be empty or could contain duplicates, then you should create a data file which is sorted by the description, and have two index files, one for the names and one for the number (which could be an 'int' and not a 'long long'). the binary search for the description could be done directly at the data file as you do now.

Sara

Peter Chan

ASKER

Many thanks Sara.
Is there any example to relate one specific item name of the original struct, to one index in your codes above?

sarabande

my last comment would relate to structures

// Data record. Sort order: desc, then name, then number.
struct nameval
{
     char name[21];
     int    number;
     char desc[100];
     bool operator< (const nameval & nv) const
     {
           // primary key: description
           const int by_desc = strcmp(desc, nv.desc);
           if (by_desc != 0)
               return by_desc < 0;
           // secondary key: name
           const int by_name = strcmp(name, nv.name);
           if (by_name != 0)
               return by_name < 0;
           // last key: number
           return number < nv.number;
     }
};

// Index record for the name key. Sort order: name, then recnum.
struct name_index
{
     char name[21];
     unsigned int recnum;
     bool operator< (const name_index & ni) const
     {
           const int by_name = strcmp(name, ni.name);
           if (by_name < 0)
               return true;
           // the tie-break on recnum is only needed if the name is not unique
           if (by_name > 0)
               return false;
           return recnum < ni.recnum;
     }
};

// Index entry: a number key paired with a record number.
// Entries order by number, then by record number.
struct number_index
{
     int number;
     unsigned int recnum;
     bool operator< (const number_index & ni) const
     {
           if (number != ni.number)
                 return number < ni.number;
           // the record number tie-break only matters if numbers are not unique
           return recnum < ni.recnum;
     }
};

Open in new window

the first structure contains all three items and will be sorted by the desc + name + number. because of that the data file could be used for a binary search on the 3rd item 'desc'. you could use a std::vector as container for the records since the "desc" is not an unique key. after 1 million records you would sort the vector by using the operator<. you also could use a std::set because the operator< uses name and number to make each record unique. it might be interesting to find out which container is faster. for any case write the records (of struct nameval) to a file. finally all data files will be merged to one huge file.

the other two structures describe the records for the name index and the number index. both should have unique keys. you can't build the index files for name and number parallel to creating the data records because the final record number will not be known before merge of the data file. so the index files must be created after merge by reading the data file sequentially and inserting both name_index and number_index to containers (either std::set or std::vector). after 1 million entries you would store the sorted containers to a file and clear the containers for the next chunk. finally the index files need to be merged into big files.

Sara

Peter Chan

ASKER

Many thanks Sara.

I think only writing the big file, I should have several file copies, due to different indexes, right?

How to refer to different struct, due to different items below

     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];
     ...

Open in new window

since in the original .h file, I now have several items shown in above?

Peter Chan

ASKER

Correction: I think only when writing the big file, I should have several file copies, due to different indexes, right?

sarabande

no, if you follow the design I described in my last post, you would have 3 "big" files:

1 data file sorted by combination description+name+number. its packed record size is 125 (unpacked 128) bytes.
1 index file sorted by name. its record size is 24 (20 for name + 4 for record number)
1 index file sorted by number. its record size is 8 (4 for number and 4 for record number).

though the last file is 16 times smaller than the data file, it is still too big to be fully stored in memory. if using a vector you would not get the contiguous memory storage needed. and if using a set, the number of pointers used would lead to severe swapping such that your program would take hours if not days to create the file.

because of that you need to create smaller files first which can be sorted in memory, and then merge them into one big file, which doesn't need much memory.

Sara

Peter Chan

ASKER

Many thanks Sara.

My current struct is having 4 items inside and I did create small-size files and big file to include all 4 items. I know we can have specific struct for the expected order we want to have, to one specific item? or i?_int. I think I have to create different big file for different index I expect to have. How to adjust the following, to create the big file, per new struct of the expected item?/i?_int ordering?

	// k-way merge: repeatedly take the smallest current record (by item1)
	// among the still-open input files and append it to file_w.
	long long write_counter = 0;
	while (true)
	{
		// find the open input whose current record has the smallest item1
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 80; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min<0 || names[n].item1 < name_min)
			{
				name_min = names[n].item1;
				n_min = n;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done

		// write the selected record exactly once, and check the result,
		// BEFORE its slot is overwritten by the next record of that file
		if (!file_w.write((char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << std::endl;
		}

		// refill the slot from the same file; at end of file retire that input
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
	}
	file_w.close();
	...

Open in new window

Peter Chan

ASKER

Here is one sample small-size file, that is having all 4 items inside.
https://dl.dropboxusercontent.com/u/40211031/flout0.zip

Peter Chan

ASKER

Per my understanding to have different structs as you've advised, we need to have record number to all struct, while within my original struct, the files would be sorted by record number, and each other different big file, which is ordered by one item? or i?_val, would be further searched per given item values, either item? or i?_val, and we would finally use the captured record number, to search against the original big file, to catch only one record. Is this correct?

sarabande

Here is one sample small-size file, that is having all 4 items inside

I can't open a binary zip file until tomorrow. I don't actually know what you mean by 4 'items'. do you mean 4 records? or do you mean 4 members?

to have different structs as you've advised, we need to have record number to all struct

what do you mean by that?

my suggestion uses an original structure with 3 members: a name of 21 characters, a 32-bit number, a description of 100 characters. so the main data file has a record size of 125 bytes (or 128 bytes if aligned to 32-bit boundary). it doesn't need a member for record number because the record number is known when reading the structure from file. if the file is sorted by the 100-character text value (+ name as second sort criteria if texts are not unique) you could do a binary search for the text with the method you now used in readbinaryfile. if the texts are not unique such a search may have more than one match. the binary search would find one of a group of identical texts and you would have to search from the record position in both directions to get the whole result set. but, since the file is sorted by text, all records with same text definitively build a contiguous block within the data file.

for the index files, you would have a different design. an index record structure is a pair of key+recordnumber. the record numbers point to the data file.

here a sample with 5 records:

data file 5 records. it is ordered by 3rd column text.

name number text

whatever    33            Axy zjk...
anykey        100          Dabc G..
some           29            Gxxx ku...
name           531         Maaa bJ...
other           41            Szavh...

Open in new window

name index file has also 5 records. it contains name column as key and points to record in data file. it is sorted by name and you can do a binary search if you lookup for a key. for example if you look for 'some' you would first check the middle key 'other' and find out that 'some' is greater 'other'. because of that you read the middle of the second half and read 'some' which is the match.

name recnum

anykey         2
name           4
other           5 
some           3
whatever    1

Open in new window

number index file has also 5 records. it contains number column as key and points to record in data file. it is sorted by number and you can do a binary search if you lookup for a given number. for example if you look for 35 you would first check the middle key 41 and find out that 35 is less than 41. because of that you read the middle of the first half and read 33 which is less than 35. the next 'middle' is 33 again, what means that 35 cannot be found.

number recnum

29                  3
33                  1
41                  5
100                2
531                4

Open in new window

to sum up, the original file is sorted by 3rd member. hence, you don't need an extra index file for this column. the two other columns will be stored separately in index files and point to the record numbers in the data file. you can do a binary search on those keys but have to read into the data file for a hit to get all data for the entry.

Sara

Peter Chan

ASKER

Many many thanks Sara.

Within my previous struct header file, I have

     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];
     ...

Open in new window

We actually use the big file to do search, right? And I have different items shown in above, and I expect to sort on one specific item only, upon the given need. It means we have to create different "big" file sorted on given item or item "long" integer (with name like i?_val), right? How to create the big file, being sorted on one specific item name or item integer?

sarabande

can you answer my question what you mean by item1, item2, item3, item4, .... ?

why do you have a struct which has multiple triples of members? why not using an array? and what has the new structure to do with the current namval, savebinaryfile, readbinaryfile sources?

How to create the big file, being sorted on one specific item name or item integer?

to build the three files you already have code for all the functionality needed.

data file:
use structure nameval as posted by me. it already has a valid operator< defined such that it would sort by 3rd text member.
- create 80 files with 1 million entries each by using a std::set to get the entries sorted per file (std::vector probably would need too much contiguous memory).
- merge all 80 files to one
- reopen the huge file and read it sequentially
- that way you can count the record number from 1 to 80 million
- use structure name_index and number_index as posted by me. both already have a valid operator< defined.
- use two std::set containers, one for each structure.
- for each index create 80 files with 1 million entries each by using the std::set to get the entries sorted by name respectively by number. as the structures for index are small you could create both the index files parallel.
- for each index merge all 80 files to one.
- for name index you have to use an array of 80 names to find the current minimum.
- for number index use an array of 80 integers.

finally you would have 3 big files where each of them could be used for a binary search. if you have a hit when using one of the index files you finally have to read the data file at record number which you got from the found index entry.

Sara

Peter Chan

ASKER

Many thanks Sara.

I need to have about 4 names and 4 numbers, to the original struct due to different purpose, to each record, and I expect to be able to search by one of them. This is why I showed you the struct having 4 items. Does it mean I should have 6 extra structs, like what you showed by name_index, and number_index?

sarabande

I need to have about 4 names and 4 numbers, to the original struct due to different purpose

ok. are the 4 names and 4 numbers unique? I mean do you have 320 million different names and 320 million different numbers for the 80 million records? or will you have empty names or zero numbers? what about the 100 char texts? do you have 4 such texts in a record or only one? if you have 4 names, 4 numbers, and 4 texts why not using 4 records instead? what is the reason that you want to put them into one record?

note, if you would create a huge file out of your structure, you also could read records from the file which contain only one item. that means if you make index files for names and numbers, the entries in those index files also could point to record numbers of records which contain only 1 name, number, and text.

if you would have (up to) 4 names, (up to) 4 numbers, and only 1 text, you should use the concept I described above with a little change. you would have a data file sorted by text what makes sense as there is only 1 text per record. hence you later can do a quick search on any text (or text begin) on the data file itself. then you would create only 1 index file for the names and 1 index file for the numbers. these files would get not 80 million entries but up to 320 million entries granted that you always have 4 names and 4 numbers per data record. but you easily could handle a dynamic number of names and numbers per data record as well since you build the index files by reading from the data records. so you are free to add 1, 2, 3 or 4 names to the name index, and 1, 2, 3, or 4 numbers to the number index. so some records of the data file would be referenced up to 8 times from the index files and others maybe only two times. note, the merge mechanism will also work if the files you were merging have a different number of entries. so regardless how many index entries you have collected in a set, you always could write it to a file and open a new file and clear the old set.

if you have 4 names, 4 numbers, and (up to) 4 texts you would not sort the data files but write them sequentially without using a set to the huge file. that means you also don't need to merge the data file. additionally, you can create the index files directly while writing the data file because the record number you need to store in the index entries is simply the loop counter from 1 to 80 million. note, you couldn't search for texts if you do so, beside you create an index file for the texts as well. if this could be a requirement you may think of creating a 3rd index file for texts where you don't store all text but for example non-trivial nouns or words. same as for name and number you may use an arbitrary count of strings from texts per data record. however, you should limit the "word length" for example to 20 characters such that the index file would not exceed the size of the data file.

Sara

Peter Chan

ASKER

Many thanks Sara.

OK, now 1st big file is ordered by item1 and i1_int, like before. And I want to create one other big file sorting on item2 and i2_int (this file will provide the way to search by item2 instead), and I have these codes

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size binary record holding four (name, number, text) triples.
// Records order by item1, then by i1_int.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item1 in bytes, at most sizeof(item1).
     // The scan is bounded by the array size, so a name that fills all
     // 21 bytes without a terminating NUL is not read past the member.
     int  get_len() const
     {
         const void * nul = memchr(item1, '\0', sizeof(item1));
         if (nul == NULL)
             return (int)sizeof(item1);
         return (int)((const char *)nul - item1);
     }
     // Converts item1 to wide characters into nm_uni, which has room for
     // sizfld elements. At most sizfld elements are written; the result is
     // NUL-terminated whenever there is room for the terminator. Nothing
     // is written if nm_uni is NULL or sizfld is not positive.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == NULL || sizfld <= 0)
             return;
         // work on a terminated copy so the conversion cannot run past item1
         char name[sizeof(item1) + 1];
         const int bytes = get_len();
         memcpy(name, item1, (size_t)bytes);
         name[bytes] = '\0';
         int len = bytes;
         if (len > sizfld)
             len = sizfld;
         const size_t written = mbstowcs(nm_uni, name, (size_t)len);
         if (written == (size_t)-1)
             nm_uni[0] = L'\0';         // invalid multibyte sequence: yield empty text
         else if (written < (size_t)sizfld)
             nm_uni[written] = L'\0';   // mbstowcs omits the NUL when it stops at the count
     }
     bool operator< (const struc3 & a2) const
     {
           // bounded compare, evaluated once
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0) return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

// struc3c.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size binary record holding four (name, number, text) triples.
// Records order by item2, then by i2_int.
struct struc3c
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item2 in bytes, at most sizeof(item2).
     // The scan is bounded by the array size, so a name that fills all
     // 21 bytes without a terminating NUL is not read past the member.
     int  get_len() const
     {
         const void * nul = memchr(item2, '\0', sizeof(item2));
         if (nul == NULL)
             return (int)sizeof(item2);
         return (int)((const char *)nul - item2);
     }
     // Converts item2 to wide characters into nm_uni, which has room for
     // sizfld elements. At most sizfld elements are written; the result is
     // NUL-terminated whenever there is room for the terminator. Nothing
     // is written if nm_uni is NULL or sizfld is not positive.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == NULL || sizfld <= 0)
             return;
         // work on a terminated copy so the conversion cannot run past item2
         char name[sizeof(item2) + 1];
         const int bytes = get_len();
         memcpy(name, item2, (size_t)bytes);
         name[bytes] = '\0';
         int len = bytes;
         if (len > sizfld)
             len = sizfld;
         const size_t written = mbstowcs(nm_uni, name, (size_t)len);
         if (written == (size_t)-1)
             nm_uni[0] = L'\0';         // invalid multibyte sequence: yield empty text
         else if (written < (size_t)sizfld)
             nm_uni[written] = L'\0';   // mbstowcs omits the NUL when it stops at the count
     }
     bool operator< (const struc3c & a2) const
     {
           // bounded compare, evaluated once
           const int cmp = strncmp(item2, a2.item2, sizeof(item2));
           if (cmp != 0) return cmp < 0;
           return i2_int < a2.i2_int;
     }
};

#endif

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
//#include "..\..\include\struc3b.h"   
#include "..\..\include\struc3c.h"   
//#include "..\..\include\struc3d.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
int main()
{
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long cnt_val = 0;
	std::ofstream files[11];
	std::ofstream file_w;
	std::set<struc3> records_w;
	std::ostringstream filename_w;
	filename_w << "c:\\dp4b\\flout_w.bin";
	std::string strfilename_w = filename_w.str();
	file_w.open(strfilename_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	std::ofstream file_w3;
	std::set<struc3c> records_w3;
	std::ostringstream filename_w3;
	filename_w3 << "c:\\dp4b\\flout_w3.bin";
	std::string strfilename_w3 = filename_w3.str();
	file_w3.open(strfilename_w3.c_str(), std::ios::binary | std::ios::out);
	if (!file_w3.is_open()) return errno;
	...

Open in new window

how to correct these?

Error	4	error C2065: 'struc3c' : undeclared identifier	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	5	error C2923: 'std::set' : 'struc3c' is not a valid template type argument for parameter '_Kty'	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	6	error C2133: 'records_w3' : unknown size	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile
Error	7	error C2512: 'std::set' : no appropriate default constructor available	F:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	45	1	SaveBinaryFile

Open in new window

sarabande

#ifndef NAME_VAL_H

because of that preprocessor statement the struc3c struct was not recognized by the compiler.

each headerfile needs its own macro to protect from being included twice. you better would derive the macro from file name to avoid such issues.

//struc3c.h
#ifndef STRUC3C_H
...
#endif
// - eof -

Open in new window

Sara

sarabande

And I want to create one other big file sorting on item2 and i2_int

i wonder how long it takes to create such huge files where three-fourths of each record is only wasted space.

your current record size is 516 bytes and you could go with 128 instead, still having 4 names and 4 numbers plus text.

one of your files is 40 gb in size. so if you want to support all 4 names as keys and all 4 numbers you need 320 gb.

if using one big file with record size 128 the file size is 10 gb. two index files with 320 million entries each would take 8 gb for name index and 2.5 gb for number index.

you have all the instruments for to build those files and contrary to your current approach it is less complex and doesn't require to exchange header files and get issues because of redundant code.

Sara

Peter Chan

ASKER

one of your files is 40 gb in size. so if you want to support all 4 names as keys and all 4 numbers you need 320 gb.

if using one big file with record size 128 the file size is 10 gb. two index files with 320 million entries each would take 8 gb for name index and 2.5 gb for number index.

you have all the instruments for to build those files and contrary to your current approach it is less complex and doesn't require to exchange header files and get issues because of redundant code.

Many thanks Sara.
What other ways should be adopted, save the space wasted?

Peter Chan

ASKER

Correction:

What other ways should be adopted, to save the space wasted?

And I don't know why the 2nd big file cannot be created, using these

// struc3.h
#ifndef NAME_VAL_H
#define NAME_VAL_H

// Fixed-size binary record holding four (name, number, text) triples.
// Records order by item1, then by i1_int.
struct struc3
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item1 in bytes, at most sizeof(item1).
     // The scan is bounded by the array size, so a name that fills all
     // 21 bytes without a terminating NUL is not read past the member.
     int  get_len() const
     {
         const void * nul = memchr(item1, '\0', sizeof(item1));
         if (nul == NULL)
             return (int)sizeof(item1);
         return (int)((const char *)nul - item1);
     }
     // Converts item1 to wide characters into nm_uni, which has room for
     // sizfld elements. At most sizfld elements are written; the result is
     // NUL-terminated whenever there is room for the terminator. Nothing
     // is written if nm_uni is NULL or sizfld is not positive.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == NULL || sizfld <= 0)
             return;
         // work on a terminated copy so the conversion cannot run past item1
         char name[sizeof(item1) + 1];
         const int bytes = get_len();
         memcpy(name, item1, (size_t)bytes);
         name[bytes] = '\0';
         int len = bytes;
         if (len > sizfld)
             len = sizfld;
         const size_t written = mbstowcs(nm_uni, name, (size_t)len);
         if (written == (size_t)-1)
             nm_uni[0] = L'\0';         // invalid multibyte sequence: yield empty text
         else if (written < (size_t)sizfld)
             nm_uni[written] = L'\0';   // mbstowcs omits the NUL when it stops at the count
     }
     bool operator< (const struc3 & a2) const
     {
           // bounded compare, evaluated once
           const int cmp = strncmp(item1, a2.item1, sizeof(item1));
           if (cmp != 0) return cmp < 0;
           return i1_int < a2.i1_int;
     }
};

#endif

// struc3c.h
#ifndef STRUC3C_H
#define STRUC3C_H

// Fixed-size binary record holding four (name, number, text) triples.
// Records order by item2, then by i2_int.
struct struc3c
{
     char item1[21];
     long long    i1_int;
     char i1_val[100];

     char item2[21];
     long long    i2_int;
     char i2_val[100];

     char item3[21];
     long long    i3_int;
     char i3_val[100];

     char item4[21];
     long long    i4_int;
     char i4_val[100];

     // Length of item2 in bytes, at most sizeof(item2).
     // The scan is bounded by the array size, so a name that fills all
     // 21 bytes without a terminating NUL is not read past the member.
     int  get_len() const
     {
         const void * nul = memchr(item2, '\0', sizeof(item2));
         if (nul == NULL)
             return (int)sizeof(item2);
         return (int)((const char *)nul - item2);
     }
     // Converts item2 to wide characters into nm_uni, which has room for
     // sizfld elements. At most sizfld elements are written; the result is
     // NUL-terminated whenever there is room for the terminator. Nothing
     // is written if nm_uni is NULL or sizfld is not positive.
     void get_uni_nm(wchar_t nm_uni[], int sizfld) const
     {
         if (nm_uni == NULL || sizfld <= 0)
             return;
         // work on a terminated copy so the conversion cannot run past item2
         char name[sizeof(item2) + 1];
         const int bytes = get_len();
         memcpy(name, item2, (size_t)bytes);
         name[bytes] = '\0';
         int len = bytes;
         if (len > sizfld)
             len = sizfld;
         const size_t written = mbstowcs(nm_uni, name, (size_t)len);
         if (written == (size_t)-1)
             nm_uni[0] = L'\0';         // invalid multibyte sequence: yield empty text
         else if (written < (size_t)sizfld)
             nm_uni[written] = L'\0';   // mbstowcs omits the NUL when it stops at the count
     }
     bool operator< (const struc3c & a2) const
     {
           // bounded compare, evaluated once
           const int cmp = strncmp(item2, a2.item2, sizeof(item2));
           if (cmp != 0) return cmp < 0;
           return i2_int < a2.i2_int;
     }
};

#endif

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include "..\..\include\struc3.h"   
//#include "..\..\include\struc3b.h"   
#include "..\..\include\struc3c.h"   
//#include "..\..\include\struc3d.h"   
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

using namespace std;
struc3 binrec;
int main()
{
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long cnt_val = 0;
	std::ofstream files[7];
	std::ofstream file_w;
	std::set<struc3> records_w;
	std::ostringstream filename_w;
	filename_w << "c:\\dp4b\\flout_w.bin";
	std::string strfilename_w = filename_w.str();
	file_w.open(strfilename_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	std::ofstream file_w3;
	std::set<struc3c> records_w3;
	std::ostringstream filename_w3;
	filename_w3 << "c:\\dp4b\\flout_w3.bin";
	std::string strfilename_w3 = filename_w3.str();
	file_w3.open(strfilename_w3.c_str(), std::ios::binary | std::ios::out);
	if (!file_w3.is_open()) return errno;

	for (int f = 0; f < 7; ++f)
	{
		std::set<struc3> records;
		srand((int)time(NULL));
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		std::string strfilename = filename.str();
		files[f].open(strfilename.c_str(), std::ios::binary | std::ios::out);
		if (!files[f].is_open()) return errno;
		for (cnt = 0; cnt<1000000; cnt++)
		{
			cnt_val++;
			try
			{
				struc3 val = { 0 };
				int j;
				for (j = 0; j<20; j++)
				{
					val.item1[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i1_int = cnt_val;
				strcpy(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);
				strcat(val.i1_val, " ");
				strcat(val.i1_val, val.item1);

				for (j = 0; j<20; j++)
				{
					val.item2[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i2_int = cnt_val;
				strcpy(val.i2_val, val.item2);
				strcat(val.i2_val, " ");
				strcat(val.i2_val, val.item2);
				strcat(val.i2_val, " ");
				strcat(val.i2_val, val.item2);

				for (j = 0; j<20; j++)
				{
					val.item3[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i3_int = cnt_val;
				strcpy(val.i3_val, val.item3);
				strcat(val.i3_val, " ");
				strcat(val.i3_val, val.item3);
				strcat(val.i3_val, " ");
				strcat(val.i3_val, val.item3);

				for (j = 0; j<20; j++)
				{
					val.item4[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.i4_int = cnt_val;
				strcpy(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, val.item4);
				strcat(val.i4_val, " ");
				strcat(val.i4_val, "????");

				records.insert(val);
				//records_w.insert(val);
				if ((cnt + 1) % 10000 == 0)
				{
					std::cout << val.i1_int << " | " << val.item1 << std::endl;
				}
			}
			catch (exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}
		for (std::set<struc3>::iterator it = records.begin(); it != records.end(); ++it)
		{
			try
			{
				files[f].write((char *)&(*it), sizeof(struc3));
				//file_w.write((char *)&(*it), sizeof(struc3));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}

		records.clear();
		files[f].close();  //
	}
	std::ifstream inputfiles[7];
	struc3 names[7] = { 0 };
	bool eof_reached[7] = { false };
	long long num = 7;   //
	for (int f = 0; f < 7; ++f)
	{
		std::ostringstream filename;
		filename << "c:\\dp4b\\flout" << f << ".bin";
		inputfiles[f].open(filename.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfiles[f].is_open())
			return -3; //
		if (!inputfiles[f].read((char*)&names[f], sizeof(struc3)))
			return -4; //
	}

	long long write_counter;
	write_counter = 0;
	while (true)
	{
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 7; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min<0 || names[n].item1 < name_min)
			{
				name_min = names[n].item1;
				n_min = n;
				continue;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done
		file_w.write((char*)&names[n_min], sizeof(struc3));
		if (!inputfiles[n_min].read((char*)&names[n_min], sizeof(struc3)))
		{
			eof_reached[n_min] = true;
			inputfiles[n_min].close();
		}
		if (!file_w.write((char*)&names[n_min], sizeof(struc3)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].item1 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  item1 = " << names[n_min].item1 << "std::endl";
		}
	}
	file_w.close();

	write_counter = 0;
	while (true)
	{
		std::string name_min3;
		long long n_min3 = -1;
		for (int n = 0; n < 7; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min3<0 || names[n].item2 < name_min3)
			{
				name_min3 = names[n].item2;
				n_min3 = n;
				continue;
			}
		}
		if (n_min3 < 0) break; // if all files were closed you are done
		file_w3.write((char*)&names[n_min3], sizeof(struc3c));
		if (!inputfiles[n_min3].read((char*)&names[n_min3], sizeof(struc3c)))
		{
			eof_reached[n_min3] = true;
			inputfiles[n_min3].close();
		}
		if (!file_w3.write((char*)&names[n_min3], sizeof(struc3c)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min3].item2 << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min3 = " << n_min3 << "  item2 = " << names[n_min3].item2 << "std::endl";
		}
	}
	file_w3.close();

	std::cout << "the maximum int value is " << integer_limit << std::endl;
	return 0;
}

Open in new window

sarabande

if you want to create 2 big files which were sorted differently you have to copy all parts of the code in main function not only the merge part. all the 1 million record files were sorted by item1. hence, it makes no sense to merge them by item2.

moreover, you didn't even close the files before your second merge loop, nor did you initialize the eof_reached flags. both mistakes prevent the second merge loop to read any new records from files and therefore your second file is empty. but even if you would solve these merge issues and reopen the smaller files and reset the eof flags, the total file would not be sorted by item2 because the smaller files are sorted by item1.

i strongly recommend to not going this way which is a dead-end. instead let us make classes and functions such that we could make the current functionality reusable and were able to creating a smart database with a couple of index files by using code we already have and only needed to be parameterized.

if you ignore my advice and nevertheless want to create a couple of huge files, each of them sorted differently, you could make 4 copies of your program and do the appropriate changes in each of them. omit the header files and define the structures directly above the main function. that way you also could easily adapt operator< to your needs. and you wouldn't need more help as you already have one working copy ....

Sara

Peter Chan

ASKER

i strongly recommend to not going this way which is a dead-end. instead let us make classes and functions such that we could make the current functionality reusable and were able to creating a smart database with a couple of index files by using code we already have and only needed to be parameterized.

Many thanks Sara.

Using your given struct from your previous replies, should I then create both the small-size file and big file, to the relevant struct you showed?

sarabande

my recommendation is to make one big file not sorted at all and put all names and numbers into index files (the names of item1, item2, item3 and item4 would be in the same index file, same as the numbers of item?). each index entry would point to the original record number of the big file. when doing so, you don't need to create small files for the big data file but could create and write the big data file sequentially with one loop. in the same loop you would create small files both for name index and number index. after 4 million entries (for all 4 items) you would write the files from their sorted set, close them and create a new small file where you start with empty sets for the next portion of small keys.

before i make a suggestion how you could build classes and functions out of your current code, i would need you to answer a question which i have asked before. what is the purpose of the i?_val text member? do you need one per record or one per item? your current code concatenates the name for each item a few times , what doesn't seem to be a meaningful use case.

if the texts are just arbitrary for the moment, i would suggest to use a little function which creates some kind of sentences for the texts, similar like you did it for the names:

// Returns a pseudo-random integer drawn from rand().
//
// maxrand: number of distinct values to choose from; the draw is rand() % maxrand.
//          A value of 0 yields `off` instead of evaluating rand() % 0.
// off:     offset added to the drawn value (default 0).
// returns: (rand() % maxrand) + off, i.e. a value in [off, off + maxrand - 1]
//          for positive maxrand; `off` when maxrand is 0.
int randindex(int maxrand, int off = 0)
{
   if (maxrand == 0)
      return off;   // nothing to draw from; rand() % 0 would be a division by zero
   return (rand()%maxrand)+off;
}

// Builds a random pseudo-sentence out of letter fragments and returns its
// first `maxlen` characters (the text is cut with substr(0, maxlen), so a cut
// can fall in the middle of a word).
//
// All randomness comes from randindex(), i.e. from rand(); the text therefore
// depends on the current srand() seed and on the exact order of the draws below.
//
// maxlen:  upper bound for the length of the returned string.
// returns: words separated by single blanks, each word assembled from the
//          fragment tables below.
std::string createItemText(int maxlen)
{
    // fragment tables: single vowels, single consonants, clusters used at the
    // start of a word and clusters used after a vowel
    static std::string vocals ="aeiouy";
    static std::string consonants = "bcdfghjklmnpqrstxz";
    static std::string prevocals[] = { "bl", "br", "ch", "cl", "cr", "dr", "dw", "fl", "fr", "gl", "gh", "gr", 
                                       "kl", "kn", "kr", "pl", "pr", "qu", "rh", "sch", "sh", "sp", "sl", "sk", 
                                       "st", "str", "spr", "th", "tr", "wh", "wr", };
    // NOTE(review): "rg" is listed twice, which makes it twice as likely as the other entries
    static std::string postvocals[] = { "ch", "nd", "rd", "nt", "rt", "rp", "rch", "sh", "wn", "rg", "rg","rl","th", "tt", "ss"};
    // table sizes, used as the modulus for randindex()
    static int NV = vocals.length();                            
    static int NC = consonants.length();                            
    static int NPRE = sizeof(prevocals)/sizeof(prevocals[0]);
 
    static int NPOST = sizeof(postvocals)/sizeof(postvocals[0]);
    // NW: range of the word count, NS: range of the initial fragment count per word
    static int NW = 6;
    // NOTE(review): NL is never read in this function
    static int NL   = 12;
    static int NS   = 4;
    
    // randindex(NW, 2) yields 2 .. NW+1, i.e. 2 to 7 words
    int nw = randindex(NW, 2);    // words of sentence
    std::string strspace;
    std::ostringstream os;
    while (--nw >= 0)
    {
           // strspace is empty for the first word and " " afterwards, so blanks
           // only appear between words
           os << strspace;
           strspace = " ";
           // ns counts the fragments still to emit for this word; several
           // branches below raise it again so that a word does not stop early
           int ns = randindex(NS, 1); // syllables
           // state of the word builder:
           //   bpre  - start the word with a consonant cluster (chosen with probability 1/2)
           //   bvoc  - the next fragment must be a vowel
           //   bpost - a vowel was just written, so a postvocal cluster is allowed
           bool bpre = (randindex(2) == 0);
           bool bpost = false;
           bool bvoc = false;
           while (--ns >= 0)
           {
                if (bpre == true)
                { 
                     // leading cluster, always followed by a vowel
                     bpre = false;
                     int np = randindex(NPRE);
                     os << prevocals[np];
                     if (ns == 0) ns = randindex(2, 1); // use at least 2 syllables
                     bvoc = true;
                }
                else if (bvoc == true)
                  { 
                     // single vowel
                     bvoc = false;                     
                     bpost = true;
                     int nv = randindex(NV);
                     if (ns == 0) ns = randindex(2); // at least 50 percent should not end with a vocal
                     os << vocals[nv];
                }
                else if (bpost == false || (randindex(2)==0))  // 50 percent use a single consonant after a vocal 
                {
                     // single consonant; when it opens the word (bpost == false)
                     // one or two extra fragments are granted
                     bvoc = true;
                     int nc = randindex(NC);
                     if (bpost == false) ns += randindex(2, 1);  
                     os <<consonants[nc];
                }
                else 
                {
                     // consonant cluster after a vowel
                     bpost = false;
                     int np = randindex(NPOST);
                     os << postvocals[np];
                     bvoc = true;
                 }
            }        
      }
      
      // cut the sentence to the requested maximum length
      return os.str().substr(0, maxlen);
}

Open in new window

this function makes some funny sentences of arbitrary length using a language never heard before. you would call it like

std::string strval = createItemStruct(sizeof(i1_val)-1); 
strcpy_s(val.i1_val, sizeof(val.i1_val), strval.c_str());

Open in new window

wherever you want a long arbitrary text.

a similar weakness of your current approach is the i?_int member. currently this number is the same for all 4 items, which is a redundancy of no value, just as with the texts. furthermore it is identical to (record number - 1) if we write the records to the huge file without sorting them. so you could simply remove all i?_int members without any loss of information. to improve on that you could create an arbitrary 64-bit number like this

// Creates an arbitrary, non-negative 64-bit number from two rand() draws.
//
// Both draws are widened before any arithmetic and the product is formed in
// unsigned arithmetic, so neither `rand() + constant` nor the three-factor
// product can overflow a signed type (undefined behaviour) on platforms where
// RAND_MAX is as large as INT_MAX. The product is reduced to 63 bits so that
// it always fits a non-negative long long; whenever the exact product fits,
// the value equals the plain signed computation.
//
// returns: number1 * (number2 + 1) * (number1 + number2 + 77777) modulo 2^63.
long long createItemNumber()
{
    // two separate statements keep the order of the rand() calls fixed
    const unsigned long long number1 = static_cast<unsigned long long>(rand()) + 12345ULL;
    const unsigned long long number2 = static_cast<unsigned long long>(rand()) + 98765ULL;
    const unsigned long long product = number1*(number2+1)*(number1+number2+77777);
    return static_cast<long long>(product & (~0ULL >> 1));
}

Open in new window

note, it is a good chance that the above number is unique even for 320 million entries. the same applies for the names you were creating. but you only could make sure that they were unique by validating this when creating the merged files. when the minimum was determined you would have a chance to find duplicates (names or numbers) which then could be corrected by making a little increment to the current name or number.

What other ways should be adopted, to save the space wasted?

if you want to go the recommended way, we also will find a solution for this by storing the texts into another file with variable lengths. the big data file would not have 400 bytes wasted space per record but only "pointers" to the packed text file.

Sara

Peter Chan

ASKER

Many thanks Sara.
i?_val is one item value that is store some description of the item having length of about 100. Is there any details to demonstrate the way to create the relevant index file that is being sorted on i?_int or i?_val?

sarabande

the index files were based on struct index_name and struct index_number. both structures were derived from a base struct.

// Part shared by every index entry: the number of the record the entry refers to.
struct index_base
{
       unsigned int recnum;
};

// Index entry keyed by an item name.
struct index_name : index_base
{
     char name[21];

     // Orders entries by name only; recnum takes no part in the comparison.
     bool operator< (const index_name & rhs) const
     {
           return strcmp(name, rhs.name) < 0;
     }
};

// Index entry keyed by an item number.
struct index_number : index_base
{
     long long number;

     // Orders entries by number only; recnum takes no part in the comparison.
     bool operator< (const index_number & rhs) const
     {
           return number < rhs.number;
     }
};

Open in new window

your new save program would open the big file flout.dat. its records would be based on structure fouritems:

// One item of a data record: key name, key number and a free description text.
struct item
{
       char name[21];
       long long number;
       char description[100];

       // Orders items by name; items with equal names are ordered by number.
       bool operator< (const item & rhs) const
       {
           const int cmp = strcmp(name, rhs.name);
           if (cmp != 0)
               return cmp < 0;
           return number < rhs.number;
       }
};

// Record layout of the big data file: four items stored back to back.
struct fouritems
{
       item items[4];
};

Open in new window

you don't need a std::set for fouritems struct as we would write the records unsorted to the file.

your new design would be

long long recnum = 0
create file for data file
      - use a function for creating the filename where you pass a prefix and an index
            - index == -1 means: don't add a suffix '_i' to the file name
for loop 80 counting f
     create small file for name index
          - use function for creating the filename passing "flname_idx" and counter f as arguments
     create small file for number index
          - use function for creating passing "flnum_idx" and counter f as arguments
     create set<index_name>
     create set<index_number>
     for loop 1 million counting n
           create record for 'fouritems' 
           for loop 4 counting i 
                 fill item i with random data.
                     - use a function to create name, number, and description of each item
                 create a index_name by using name and recnum
                 insert name of item to name set
                 create a index_number by using number and recnum
                 insert number of item to number set
           end for loop 4
           write 'fouritems' record to data file
           increment recnum
      end for loop 1 million
      for each index_name in set index_name 
             write index_name to small file index_name
      end for each
      close index_name file
      clear name_index set
      for each index_number in set number index 
             write index_number to small file index_number
      end for each
      close index_number file
      clear index_number set
end for loop 80
close big data file

Open in new window

note, you may not try to use your old code and make changes. instead try to make functions for any code part which was used more than once. copy the functionality from your old code, make a function out of it, and use only a call in your new main function.

if we got the new design compiled we finally would make an application class and turn the functions to member functions. we also would add functionality to have a separate data file for the texts such that the wasted space now could be avoided.

Sara

sarabande

note, the index_base was not used so far. it will get into use when we design the binary search and either search for names or numbers.

Sara

Peter Chan

ASKER

Many many thanks Sara.
Can I have more details to these?

     create set<index_name>
     create set<index_number>

Open in new window

sarabande

// Sorted collections of index entries; the order comes from operator< of the
// entry type. A std::set keeps only one of several entries that compare
// equal, so entries with the same name (or the same number) collapse into one.
std::set<index_name> nameset;
std::set<index_number> numberset;

Open in new window

if you put these definitions into the f loop the sets will be cleared automatically at end of each loop circle.

Sara

Peter Chan

ASKER

Sorry, fouritems is now referring to 4 items, right? How about that sometimes I only want to sort on one specific item, how?

Peter Chan

ASKER

I mean to the big file.

sarabande

the big file is not sorted at all. with the new design you can search for all items regardless whether they were item1, item2, item3 or item4.

if you need a list of all records sorted - say - by item3 name only, you would read sequentially from name index file, then read the record from big file at the record number where the index is pointing to. then check whether the name you have read from index equals the item3 name. if yes, print the record (or write it to file). if not, skip the record.

for example if you have records with the following names

"B" "C" "D" "E"
"Z" "W" "V" "Y"
"M" "N" "A" "P"

Open in new window

your index file would be (be aware: record numbers are 0-based)

"A" -> 2
"B" -> 0
"C" -> 0
"D" -> 0
"E" -> 0
"M" -> 2
"N" -> 2
"P" -> 2
"V" -> 1
"W" -> 1
"Y" -> 1
"Z" -> 1

Open in new window

so if you read the index sequentially and take only the records where the index name is item3 name you get the records

2:  "M" "N" "A" "P"
0:  "B" "C" "D" "E"
1:  "Z" "W" "V" "Y"

Open in new window

which are ordered by item 3.

Sara

sarabande

note, if sorted lists from big file per item is a requirement you may speed-up the creation of such lists by adding the item index to the index structure. that way you could decide from index record whether it belongs to the required index or not.

that also would help if you want to search for a name in the item 3 name column only. then the binary search algorithm would skip all names when reading from index file which are not item 3 names.

Sara

Peter Chan

ASKER

Many thanks Sara. should I put recnum to both index_name and index_number structs?

sarabande

if you derive from index_base as I suggested recnum "IS" a member both of index_name and index_number.

you could add member itemnum (values: 1,2,3,4) to index_base if you want to support an item specific search.

Sara

Peter Chan

ASKER

Sorry, can you please give me more details of the mechanism, to use the 2 indexes, to locate the item record? Thanks a lot

sarabande

did you already succeed in creating the datafile and the two index files?

the search mechanism is identical to that currently used in readbinaryfile with two exceptions.

First, you would not open the datafile to do the binary search but one of the two (merged) index files, depending on whether you want to search for name or number. you also would calculate nbegin, nend, nmid by using the st_size member of the __stat64 structure where you called _stat64 function for the index file (and not of the data file).

Second, if you have found an index record that matches, you would open the data file and read the items record (use struct fouritems for the read). you would need to calculate the file position by multiplying the recnum member of the index structure by the size of an items record, which is sizeof(fouritems). then use the seekg function to position within the data file before reading.

then you can print members of all 4 items of the record as result of your query. one of the 4 items of the record matches with the search criteria.

Sara

Peter Chan

ASKER

Sorry Sara.

To save space, we can skip the step to create small files for both index_name and Index_number, but to only have the big files for both, right?
Probably we only need to keep small files for items only, right?

SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Peter Chan

ASKER

Many thanks Sara.

Error	4	error C2533: 'fouritems::{ctor}' : constructors not allowed a return type	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	95	1	SaveBinaryFile
Error	5	error C2065: 'record' : undeclared identifier	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	6	error C2228: left of '.items' must have class/struct/union	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	7	error C2228: left of '.name' must have class/struct/union	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	101	1	SaveBinaryFile
Error	8	error C2065: 'record' : undeclared identifier	Z:\SaveBinaryFile\SaveBinaryFile\SaveBinaryFile.cpp	102	1	SaveBinaryFile

Open in new window

to the last 3 lines below

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

// Part shared by every index entry: the number of the record the entry refers to.
struct index_base
{
	unsigned int recnum;
};

// Index entry keyed by an item name.
struct index_name : index_base
{
	char name[21];

	// Orders entries by name only; recnum takes no part in the comparison.
	bool operator< (const index_name & rhs) const
	{
		return strcmp(name, rhs.name) < 0;
	}
};

// Index entry keyed by an item number.
struct index_number : index_base
{
	long long number;

	// Orders entries by number only; recnum takes no part in the comparison.
	bool operator< (const index_number & rhs) const
	{
		return number < rhs.number;
	}
};

// One item of a data record: key name, key number and a free description text.
struct item
{
	char name[21];
	long long number;
	char description[100];

	// Length of `name` in bytes, never more than sizeof(name).
	// The scan is bounded by the array size, so a name that fills all 21 bytes
	// without a terminating '\0' is measured without reading past the member.
	int  get_len() const
	{
		const void * terminator = memchr(name, '\0', sizeof(name));
		if (terminator == NULL)
			return (int)sizeof(name);
		return (int)((const char *)terminator - name);
	}

	// Converts `name` to wide characters using the current C locale.
	// nm_uni: destination buffer.
	// sizfld: capacity of nm_uni in wchar_t elements; nothing is written when
	//         it is <= 0 (a negative count would otherwise turn into a huge
	//         unsigned one).
	// At most sizfld elements are written. The result is '\0'-terminated
	// whenever the buffer has room left for the terminator; if the name cannot
	// be converted the buffer receives an empty string.
	void get_uni_nm(wchar_t nm_uni[], int sizfld) const
	{
		if (nm_uni == NULL || sizfld <= 0)
			return;
		int len = get_len();
		if (len > sizfld)
			len = sizfld;
		size_t written = mbstowcs(nm_uni, name, len);
		if (written == (size_t)-1)
			written = 0;   // invalid multibyte sequence
		// mbstowcs does not terminate the output when it stops at the count limit
		if (written < (size_t)sizfld)
			nm_uni[written] = L'\0';
	}

	// Orders items by name; items with equal names are ordered by number.
	bool operator< (const item & ni) const
	{
		if (strcmp(name, ni.name) < 0) return true;
		if (strcmp(name, ni.name) > 0) return false;
		if (number< ni.number) return true;
		return false;
	}
};

using namespace std;
// NOTE(review): nothing in the code shown in this listing reads or writes
// binrec — confirm that it is still needed.
item binrec;

// Fills `name` with sizname-1 random ASCII letters (each one upper or lower
// case) and terminates it with '\0'.
// name:    destination buffer of at least sizname bytes.
// sizname: buffer size including the terminator; values <= 0 leave `name`
//          untouched.
void createItemName(char name[], int sizname)
{
	if (sizname <= 0)
		return;   // no room even for the terminator; name[sizname - 1] would be out of bounds
	for (int j = 0; j<sizname - 1; j++)
	{
		// 65 is 'A' and 97 is 'a'. The lower-case base used to be 920, a typo
		// whose sums do not fit into a char and produced non-letter bytes.
		name[j] = (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
	}
	name[sizname - 1] = '\0';
}

// Record layout of the data file: four items stored back to back.
struct fouritems
{
	item items[4];
	// bfill == true asks the constructor to fill the four items with generated
	// data; the default (false) only zero-fills the record.
	fouritems(bool bfill = false);

};

// Constructor of fouritems: zero-fills the whole record and, when bfill is
// true, is meant to give each of the four items a name, a number and a
// description.
// NOTE(review): as posted this definition does not compile:
//   * a constructor must not declare a return type; the leading 'void' is
//     what error C2533 in the list above complains about
//   * 'record' is not declared in this scope (C2065); presumably the members
//     of the object under construction, i.e. items[j], were meant
//   * createItemNumber and createItemText are not declared anywhere in this
//     listing — confirm they are defined above this point in the real file
void fouritems::fouritems(bool bfill)
{
	// start from an all-zero record so that unused bytes are deterministic
	memset(this, 0, sizeof(fouritems));
	if (bfill == true)
	{
		for (int j = 0; j<4; ++j)
		{
			createItemName(record.items[j].name, 21);
			record.items[j].number = createItemNumber();
			// 99 = sizeof(description) - 1
			strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
		}
	}
}

Open in new window

while I did some other correction to the project.

sarabande

constructors not allowed a return type

remove the 'void' before constructor function

the other error are following errors of the first one.

Sara

sarabande

by the way: I wonder why you still have a global variable 'binrec' defined. you easily should see that 'binrec' is no longer used since months.

any code which is no longer used is ballast and should be removed. the same applies to functions that are never called, like get_uni_nm or get_len, which would only make sense if you handled wide strings as well or had keys of different lengths. but you have neither the one nor the other.

for your information: if one uses a struct rather than a class, they normally do so, because they want to access members from outside without using member functions. in a class you normally would have private or protected members and "setters" and "getters" member functions for accessing them outside of the class.

however, this is only convention to do so, and actually there is no difference (in c++) between struct and class beside that for the struct the members default to be public if not defined differently while in a class they default to be private.

our structures are pure data structures what means they could be used to exchange data records from file to structure and reverse. so we don't want to make them complex objects and use struct rather than class. nevertheless, c++ functionality like constructors, operator< make sense also for struct and can help to achieve a better design.

Sara

Peter Chan

ASKER

Many thanks Sara.
I don't know which record you're referring to, on last 3rd line below?

// Constructor of fouritems: zero-fills the whole record and, when bfill is
// true, is meant to give each of the four items a name, a number below
// 10000000 and a description.
// NOTE(review): 'record' is not declared in this scope. Inside a constructor
// the members of the object being built are reached directly, so presumably
// items[j] (without the 'record.' prefix) is what was meant.
// NOTE(review): createItemText is not declared in this listing — confirm it
// is defined above this point in the real file.
fouritems::fouritems(bool bfill)
{
	// start from an all-zero record so that unused bytes are deterministic
	memset(this, 0, sizeof(fouritems));
	if (bfill == true)
	{
		for (int j = 0; j<4; ++j)
		{
			createItemName(record.items[j].name, 21);
			record.items[j].number = std::rand() % 10000000;
			// 99 = sizeof(description) - 1
			strcpy_s(record.items[j].description, 99, createItemText(99).c_str());
		}
	}
}

Open in new window

Peter Chan

ASKER

what should be "record" to this

record.items[j].name

Open in new window

Thanks a lot and appreciate a lot to your help!

SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Peter Chan

ASKER

Sorry, how to adjust the following

		for (cnt = 0; cnt<1000000; cnt++)
		{
			cnt_val++;
			try
			{
				item val = { 0 };
				int j;
				for (j = 0; j<20; j++)
				{
					val.name[j] += (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
				}
				val.number = cnt_val;
				strcpy(val.description, val.name);
				strcat(val.description, " ");
				strcat(val.description, val.name);
				strcat(val.description, " ");
				strcat(val.description, val.name);

				//strcat(val.i4_val, "????");

				records.insert(val);
				...

Open in new window

to use createItemName, and fouritems event instead?

SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Peter Chan

ASKER

Many many thanks Sara.
I've already created the small files for both name and number. what should be adjusted below

		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		std::string strflname = flname.str();
		filesname[f].open(strflname.c_str(), std::ios::binary | std::ios::out);
		if (!filesname[f].is_open()) return errno;
		...
	std::ifstream inputfilesnam[20];
	item names[20] = { 0 };
	bool eof_reached[20] = { false };
	long long num = 20;   //
	for (int f = 0; f < 20; ++f)
	{
		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		inputfilesnam[f].open(flname.str().c_str(), std::ios::binary | std::ios::in);
		if (!inputfilesnam[f].is_open())
			return -3; //S
		if (!inputfilesnam[f].read((char*)&names[f], sizeof(item)))
			return -4; //
	}

	long long write_counter;
	write_counter = 0;
	while (true)
	{
		std::string name_min;
		long long n_min = -1;
		for (int n = 0; n < 20; ++n)
		{
			if (eof_reached[n] == true) continue;
			if (n_min<0 || names[n].name < name_min)
			{
				name_min = names[n].name;
				n_min = n;
				continue;
			}
		}
		if (n_min < 0) break; // if all files were closed you are done
		filesoutnam.write((char*)&names[n_min], sizeof(index_name));
		if (!inputfilesnam[n_min].read((char*)&names[n_min], sizeof(index_name)))
		{
			eof_reached[n_min] = true;
			inputfilesnam[n_min].close();
		}
		if (!filesoutnam.write((char*)&names[n_min], sizeof(index_name)))
		{
			std::cout << "write error " << errno << " at "
				<< write_counter << " key= " << names[n_min].name << std::endl;
			return -5;
		}
		if ((++write_counter) % 10000 == 0)
		{
			std::cout << write_counter << " n_min = " << n_min << "  name = " << names[n_min].name << "std::endl";
		}
	}
	filesoutnam.close();

Open in new window

when I'm to create the big file to name?

Peter Chan

ASKER

I mean to create the big idx file to name, in above.

SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Peter Chan

ASKER

Thanks Sara.
I did remove the parts of "records_w" below

// 

#include "stdafx.h"
#include <set>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <streambuf>
#include <string>
#include <ctype.h>
#include <time.h>
#include <process.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <algorithm>
#include <sstream>
#include <errno.h>
#include <limits>

// Number of small index files written per index kind; also the number of
// input files mergeFiles expects.
const unsigned int NUM_FILES = 20;
// Number of data records generated per small index file; also used as the
// modulus for the random item numbers.
const unsigned int NUM_RECORDS = 1000000;

// Name index entry: an item name together with the number of its data record.
struct index_name
{
	long long recnum;
	char name[21];

	// Orders entries by name only; recnum takes no part in the comparison.
	bool operator< (const index_name & rhs) const
	{
		return strcmp(name, rhs.name) < 0;
	}
};

// Number index entry: an item number together with the number of its data record.
struct index_number
{
	long long recnum;
	long long number;

	// Orders entries by number only; recnum takes no part in the comparison.
	bool operator< (const index_number & rhs) const
	{
		return number < rhs.number;
	}
};

// One item of a data record: key name, key number and a free description text.
struct item
{
	char name[21];
	long long number;
	char description[100];

	// Length of `name` in bytes, never more than sizeof(name).
	// The scan is bounded by the array size, so a name that fills all 21 bytes
	// without a terminating '\0' is measured without reading past the member.
	int  get_len() const
	{
		const void * terminator = memchr(name, '\0', sizeof(name));
		if (terminator == NULL)
			return (int)sizeof(name);
		return (int)((const char *)terminator - name);
	}

	// Converts `name` to wide characters using the current C locale.
	// nm_uni: destination buffer.
	// sizfld: capacity of nm_uni in wchar_t elements; nothing is written when
	//         it is <= 0 (a negative count would otherwise turn into a huge
	//         unsigned one).
	// At most sizfld elements are written. The result is '\0'-terminated
	// whenever the buffer has room left for the terminator; if the name cannot
	// be converted the buffer receives an empty string.
	void get_uni_nm(wchar_t nm_uni[], int sizfld) const
	{
		if (nm_uni == NULL || sizfld <= 0)
			return;
		int len = get_len();
		if (len > sizfld)
			len = sizfld;
		size_t written = mbstowcs(nm_uni, name, len);
		if (written == (size_t)-1)
			written = 0;   // invalid multibyte sequence
		// mbstowcs does not terminate the output when it stops at the count limit
		if (written < (size_t)sizfld)
			nm_uni[written] = L'\0';
	}

	// Orders items by name; items with equal names are ordered by number.
	bool operator< (const item & ni) const
	{
		if (strcmp(name, ni.name) < 0) return true;
		if (strcmp(name, ni.name) > 0) return false;
		if (number< ni.number) return true;
		return false;
	}
};

using namespace std;

// Writes sizname-1 random letters (each from 'A'..'Z' or 'a'..'z') into name
// and stores the terminating '\0' at index sizname-1.
void createItemName(char name[], int sizname)
{
	const int last = sizname - 1;
	int pos = 0;
	while (pos < last)
	{
		// letter offset 0..25 plus the base of the upper (65) or lower (97) case range
		name[pos] = (char)(rand() % 26 + ((rand() % 2) ? 65 : 97));
		++pos;
	}
	name[last] = '\0';
}

// Record layout of the data file: four items stored back to back.
struct fouritems
{
	item items[4];
	// bfill == true asks the constructor to fill the four items with generated
	// data; the default (false) only zero-fills the record.
	fouritems(bool bfill = false);

};

// Zero-fills the whole record. With bfill == true every item additionally
// receives a random name, a random number below NUM_RECORDS and a description
// consisting of the name written three times, separated by single blanks.
fouritems::fouritems(bool bfill)
{
	memset(this, 0, sizeof(fouritems));
	if (!bfill)
		return;

	char text[100];
	for (item * cur = items; cur != items + 4; ++cur)
	{
		createItemName(cur->name, 21);
		cur->number = std::rand() % NUM_RECORDS;

		// name + ' ' + name + ' ' + name
		strcpy(text, cur->name);
		for (int rep = 0; rep < 2; ++rep)
		{
			strcat(text, " ");
			strcat(text, cur->name);
		}
		strcpy_s(cur->description, 99, text);
	}
}

// Merges NUM_FILES input files of fixed-size `index` records into one output
// file by repeatedly writing the smallest of the current head records.
//
// index:      record type; it is read and written as raw bytes and must
//             provide operator<.
// outputfile: open binary output stream; it is closed before a successful return.
// inputfiles: array of NUM_FILES open binary input streams. Each file is
//             expected to be sorted by operator< already, otherwise the output
//             is not sorted either. A stream is closed once it is exhausted.
// returns:    0 on success; a non-zero value (errno when it is set, otherwise
//             -1) when an input file yields no first record or a write to the
//             output file fails.
template <class index>
int mergeFiles(std::ofstream & outputfile, std::ifstream inputfiles[])
{
	index indexarr[NUM_FILES] = {};   // current head record of every input file
	const std::streamsize recordsize = sizeof(index);
	std::vector<bool>  eofreached(NUM_FILES, false);

	for (unsigned int i = 0; i < NUM_FILES; ++i)
	{
		if (!inputfiles[i].read((char*)&indexarr[i], recordsize))
		{
			// empty file or read error. errno can still be 0 here (for example
			// at a plain end of file), and 0 must never be reported because
			// the caller treats it as success
			return errno != 0 ? errno : -1;
		}
	}
	while (true)
	{
		// find the input whose head record is smallest, skipping exhausted inputs
		unsigned int n_min = 0;
		for (unsigned int n = 1; n < NUM_FILES; ++n)
		{
			if (eofreached[n] == true)
				continue;
			if (eofreached[n_min] == true || indexarr[n] < indexarr[n_min])
			{
				n_min = n;
			}
		}
		if (eofreached[n_min] == true)
			break;   // only possible when every input is exhausted
		if (!outputfile.write((const char*)&indexarr[n_min], recordsize))
		{
			// disk full or similar: stop instead of silently losing records
			return errno != 0 ? errno : -1;
		}
		// refill the head record from the file that was just consumed
		if (!inputfiles[n_min].read((char*)(&indexarr[n_min]), recordsize))
		{
			inputfiles[n_min].close();
			eofreached[n_min] = true;
		}
	}
	outputfile.close();
	return 0;
}

// Generates the test data set:
//   1. writes NUM_FILES * NUM_RECORDS generated fouritems records to
//      c:\dp4b\flout_w.dat,
//   2. per pass of the outer loop writes one name index file (flname<f>.idx)
//      and one number index file (flnumb<f>.idx) from sorted sets,
//   3. merges the small index files into flname_w.idx and flnumb_w.idx with
//      mergeFiles.
// Returns 0 on success, errno when an output file cannot be opened, or the
// code returned by mergeFiles.
int main()
{
	// NOTE(review): integer_limit, temp and cnt_val are never read in this function.
	long long integer_limit = std::numeric_limits<std::streamsize>::max();

	long long temp = std::numeric_limits<std::streamsize>::max();
	long long cnt;
	long long recnum = 0;
	long long cnt_val = 0;
	// NOTE(review): files[] is never used; the streams flname and flnumb declared
	// here are never used either and are shadowed by the string streams of the
	// same names inside the f loop.
	std::ofstream files[NUM_FILES];
	std::ofstream filesname[NUM_FILES];
	std::ofstream filesnumb[NUM_FILES];
	std::ofstream flname;
	std::ofstream flnumb;
	std::ofstream file_w;
	//std::set<item> records_w;
	std::ostringstream flout_w;
	flout_w << "c:\\dp4b\\flout_w.dat";
	std::string strflout_w = flout_w.str();
	file_w.open(strflout_w.c_str(), std::ios::binary | std::ios::out);
	if (!file_w.is_open()) return errno;

	//std::set<item> records_w3;
	// NOTE(review): both sets live outside the f loop and are never cleared, so
	// pass f still holds the entries of passes 0 .. f-1 and every small index
	// file repeats all earlier entries. This is why the files grow from one
	// file to the next. Declaring the sets inside the f loop (or calling
	// clear() after the files are written) empties them for each pass.
	std::set<index_name> nameset;
	std::set<index_number> numbset;

	for (int f = 0; f < NUM_FILES; ++f)
	{
		// NOTE(review): records is never used.
		std::set<item> records;
		// NOTE(review): reseeding on every pass restarts rand(); two passes that
		// begin within the same second of time() would repeat the same random
		// sequence. Seeding once before the loop avoids that.
		srand((int)time(NULL));

		// open the two small index files of this pass
		std::ostringstream flname;
		flname << "c:\\dp4b\\flname" << f << ".idx";
		std::string strflname = flname.str();
		filesname[f].open(strflname.c_str(), std::ios::binary | std::ios::out);
		if (!filesname[f].is_open()) return errno;

		std::ostringstream flnumb;
		flnumb << "c:\\dp4b\\flnumb" << f << ".idx";
		std::string strflnumb = flnumb.str();
		filesnumb[f].open(strflnumb.c_str(), std::ios::binary | std::ios::out);
		if (!filesnumb[f].is_open()) return errno;

		for (cnt = 0; cnt<NUM_RECORDS; cnt++)
		{
			// NOTE(review): recnum is incremented before it is used, so the first
			// record is indexed as 1 although it sits at position 0 of the data
			// file; a reader that seeks to recnum * sizeof(fouritems) would be
			// off by one record. Confirm which convention the reader uses.
			recnum++;
			fouritems record(true);
			// NOTE(review): no exception mask is set on file_w in this function,
			// so write() reports failure through the stream state and does not
			// throw; this catch block presumably never runs and a failed write
			// goes unnoticed.
			try
			{
				file_w.write((char *)&record, sizeof(record));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}

			// add one name entry and one number entry per item of the record
			for (int j = 0; j < 4; j++)
			{
				index_name idxname;
				strcpy_s(idxname.name, 21, record.items[j].name);
				idxname.recnum = recnum;
				nameset.insert(idxname);

				// NOTE(review): idxnumb.recnum is never assigned, so the number
				// index entries are stored with an indeterminate record number.
				// NOTE(review): index_number compares by number only and the
				// numbers are rand() % NUM_RECORDS, so the set drops every entry
				// whose number is already present.
				index_number idxnumb;
				idxnumb.number = record.items[j].number;
				numbset.insert(idxnumb);

			}

		}
		// write the name entries in sorted order
		for (std::set<index_name>::iterator iname = nameset.begin(); iname != nameset.end(); ++iname)
		{
			try
			{
				index_name nam = *iname;
				filesname[f].write((char *)&nam, sizeof(nam));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}
		// write the number entries in sorted order
		for (std::set<index_number>::iterator inumb = numbset.begin(); inumb != numbset.end(); ++inumb)
		{
			try
			{
				index_number num = *inumb;
				filesnumb[f].write((char *)&num, sizeof(num));
			}
			catch (std::exception& e)
			{
				std::cout << e.what() << '\n';
			}
		}

		filesname[f].close();
		filesnumb[f].close();
	}

	// here we have 1 datafile and NUM_FILES index files for each index
	// we then would merge the index files for names to a big name index file
	// ... and merge the NUM_FILES number index files to a big number index file as well
	// NOTE(review): none of the streams opened below is checked with is_open();
	// a file that fails to open only shows up as a failed first read inside
	// mergeFiles.
	std::ifstream namefiles[NUM_FILES];
	std::ifstream numberfiles[NUM_FILES];
	for (int i = 0; i < NUM_FILES; i++)
	{
		//std::string strnamefile = createFilename("c:\\dp4\\flname", ".idx", i + 1);
		std::ostringstream flname2;
		flname2 << "c:\\dp4b\\flname" << i << ".idx";
		std::string strnamefile = flname2.str();
		namefiles[i].open(strnamefile.c_str(), std::ios::binary | std::ios::in);

		std::ostringstream flnumb2;
		flnumb2 << "c:\\dp4b\\flnumb" << i << ".idx";
		std::string strnumberfile = flnumb2.str();
		numberfiles[i].open(strnumberfile.c_str(), std::ios::binary | std::ios::in);
	}

	std::ostringstream flname_w;
	flname_w << "c:\\dp4b\\flname_w.idx";
	std::string strnameindexfile = flname_w.str();
	std::ofstream nameindexfile(strnameindexfile.c_str(), std::ios::binary | std::ios::out);

	std::ostringstream flnumb_w;
	flnumb_w << "c:\\dp4b\\flnumb_w.idx";
	std::string strnumberindexfile = flnumb_w.str();
	std::ofstream numberindexfile(strnumberindexfile.c_str(), std::ios::binary | std::ios::out);

	// here we call the mergeFiles template for name_index and number_index
	int ret;
	if ((ret = mergeFiles<index_name>(nameindexfile, namefiles)) != 0)
	{
		// log error 
		return ret;
	}
	if ((ret = mergeFiles<index_number>(numberindexfile, numberfiles)) != 0)
	{
		// log error 
		return ret;
	}
	// the data file stays open during the merge and is closed here on the
	// success path only
	file_w.close();

	return 0;
}

Open in new window

but the files created are still in increasing size like what I mentioned yesterday. why?

ASKER CERTIFIED SOLUTION

sarabande

membership

This solution is only available to members.

To access this solution, you must be a member of Experts Exchange.

Start Free Trial

Peter Chan

ASKER

Thanks a lot.
I run your current codes. But it does slowly create the relevant files. why?

sarabande

const unsigned int NUM_RECORDS = 10000000;

i made a mistake and used 10 million instead of 1 million

it creates 4 name and 4 number indices per record, which is 800 million index records.

you may change to NUM_FILES = 100 and NUM_RECORDS = 500000 what should speed-up the file creation.

Sara

Peter Chan

ASKER

Sara,
I really appreciate you a lot.

I increase the number of files to 80 and the process does spend a few hours to finish generating all files and I feel that it is a little bit slower than the old process which is not to search by several name/number. Do you think the speed is acceptable?

sarabande

80 files and 1 million records means that your program writes 80 million items records, and 2 x 320 million = 640 million name index records (both small files and big file) and 640 million number index records.

that is 41 GB for data file, 18 GB for name index files and 10 GB for number index files, what is about 70 GB in total.

if you currently need 4 hours (which is about 14,400 seconds), that is roughly 5 MB/sec or 40 Mbit/s, which does not seem so bad and could probably be improved by a factor of 5 - 10 when using an SSD drive. if you used a normal disk which is fairly empty and is not the same disk that the operating system and your program are running on, you should also be able to improve the figures by a factor of 2 at least.

programmatically you could write the files in bigger chunks and not record by record. for example you could create an array of 1000 fouritems records and fill that array rather than writing each record to the data file. then if the array was full you write 1000 x sizeof(fouritems) bytes to file with one call. by doing so, you have 80 thousand writes instead of 80 million what should be much faster. same thing could be done for the index files (you may use the same array for both the small files and the big file).

the last thing you could do is to write the description texts packed into a separate text data file. if we assume that the average length of a description is only a third of the maximum size, you would save at least half of the data file's size and have (only) 20 GB for both the data file and the text data file, which is about 30 percent less total size.

Sara

Peter Chan

ASKER

Thanks a lot Sara.

I see flname file is 8.750 GB below
https://dl.dropboxusercontent.com/u/40211031/tt5.png

while flnumber file is 133,440 KB
https://dl.dropboxusercontent.com/u/40211031/tt4.png

do you think there can be anything wrong with flnumber file?

Peter Chan

ASKER

Sorry, please omit the 2nd screenshot, as real size of flnumber should be about 509 MB and I'll upload the file for you to see it, if possible.

Peter Chan

ASKER

Here is flnumber big file
https://dl.dropboxusercontent.com/u/40211031/flnumber.idx

sarabande

do you think there can be anything wrong with flnumber file?

yes. I assume there are too many duplicate numbers created which would reduce the total number of numbers in the std::set. if that is the case the small files would have different size and don't contain 4 million numbers but much less. you also may output the size() of the std::set before they were written to file. the goal should be that it was 4 million.

do you use the createItemNumber function I posted? if not, then the issue is clear, because the algorithm you used to create the numbers produces a lot of duplicates. if you do, you should add some more summands to the expression in the createItemNumber function, each multiplied by (rand()+x) where x is some odd offset number, so that the numbers it creates are more likely to be unique than they are now. another way is to check the number before inserting it into the set, like

// Build the index entry for the current record; `= { 0 }` zero-initializes
// the struct before the record number is filled in.
index_number new_index = { 0 };
new_index.recnum = recnum;

// Keep drawing a fresh number until number_set.find() reports that no
// equivalent entry is stored yet.
// NOTE(review): which fields make two entries "equivalent" depends on
// index_number's ordering, which is not shown here -- presumably it compares
// by number; confirm.
// NOTE(review): this loop only terminates if createItemNumber() can still
// return a value that is not already in the set.
do
{
    new_index.number = createItemNumber();
} 
while (number_set.find(new_index) != number_set.end());

// At this point find() did not locate new_index, so the insert adds a new entry.
number_set.insert(new_index);

Open in new window

the above would guarantee that each number is unique. but it is dangerous: if the createItemNumber would not return at least 1 million different numbers the above loop is infinite.

Sara

p. s. I can't look into dropbox before tomorrow.

sarabande

// Builds a random non-negative decimal number digit by digit using rand().
//
// The leading digit is drawn from 0-8 and 13 to 18 further digits (0-9) are
// appended. The result therefore never exceeds 8999999999999999999, which is
// below the largest signed 64-bit integer 9223372036854775807 (19 digits),
// so the accumulation cannot overflow into a negative number. Because the
// leading digit may be 0, the value can have fewer significant digits than
// were drawn.
//
// Returns a value in the range [0, 8999999999999999999].
long long createItemNumber()
{
    // Highest digit is limited to 0-8 to stay below the 64-bit signed maximum.
    long long number = rand() % 9;

    // Draw the number of additional digits exactly once. Putting rand() in
    // the loop condition would re-roll the bound on every iteration and make
    // the loop stop early most of the time, skewing results toward the
    // shortest length.
    const int extra_digits = 13 + (rand() % 6);
    for (int n = 0; n < extra_digits; ++n)
        number = (number * 10) + (rand() % 10);
    return number;
}

Open in new window

Sara