jakac
asked on
Calculate distinct ID's and occurrences using C / C++
I am dealing with big log files with about 50,000,000 rows per day and I have to calculate a number of distinct ID's over one single day and over the whole month (from multiple log files). Log file names are named by date e.g.:
2007-07-22.log
2007-07-23.log
2007-07-24.log
2007-07-25.log etc.
Right now I am using Perl script to do this and it uses also some system calls (sort, uniq, wc)
Because I know that C / C++ is much faster than Perl I am asking all C programmers out there if anyone has any solution for this? If there is any other even faster solution on Linux for this kind of problem I am also interested.
I need two things:
1.) a program that reads log filename(s) as command line argument(s) (there can be multiple files because I also need to calculate this count for the whole month), parses them and then prints out a total count of distinct ID's as just one integer.
Log files are plain text, tab separated, unordered and the ID is in the 6th column of every row
e.g.:
date<tab>time<tab>some<tab>other<tab>stuff<tab>ID<tab>......more stuff\n
2.) I also need a program which would print out a total number of ID's which occur once, twice, three times...etc. This can be the same program if that would make it faster, no problem at all!
sample output:
no.of ID's | no. of hits
--------------------------
138491 | 1
3890 | 2
834 | 3
524 | 4
.....etc.
(this means that 138491 distinct ID's are logged only once and 3890 distinct ID's are logged twice etc.)
Right now the number of distinct ID's per day is about 20,000,000 and about 80,000,000 per month so I am dealing with a big data set and can not figure out how to make a program or a script which would calculate all this very fast. Perl script takes a few hours and also dramatically increases load so the computer is not usable during that time.
I would be also interested in any distributed solution (to work over 2 or more computers on LAN in order to calculate as fast as possible) if anybody has that kind of skills and I am also willing to give extra points for that kind of solution.
2007-07-22.log
2007-07-23.log
2007-07-24.log
2007-07-25.log etc.
Right now I am using Perl script to do this and it uses also some system calls (sort, uniq, wc)
Because I know that C / C++ is much faster than Perl I am asking all C programmers out there if anyone has any solution for this? If there is any other even faster solution on Linux for this kind of problem I am also interested.
I need two things:
1.) a program that reads log filename(s) as command line argument(s) (there can be multiple files because I also need to calculate this count for the whole month), parses them and then prints out a total count of distinct ID's as just one integer.
Log files are plain text, tab separated, unordered and the ID is in the 6th column of every row
e.g.:
date<tab>time<tab>some<tab>other<tab>stuff<tab>ID<tab>......more stuff\n
2.) I also need a program which would print out a total number of ID's which occur once, twice, three times...etc. This can be the same program if that would make it faster, no problem at all!
sample output:
no.of ID's | no. of hits
--------------------------
138491 | 1
3890 | 2
834 | 3
524 | 4
.....etc.
(this means that 138491 distinct ID's are logged only once and 3890 distinct ID's are logged twice etc.)
Right now the number of distinct ID's per day is about 20,000,000 and about 80,000,000 per month so I am dealing with a big data set and can not figure out how to make a program or a script which would calculate all this very fast. Perl script takes a few hours and also dramatically increases load so the computer is not usable during that time.
I would be also interested in any distributed solution (to work over 2 or more computers on LAN in order to calculate as fast as possible) if anybody has that kind of skills and I am also willing to give extra points for that kind of solution.
Lil' correction, 'main()' should be
int main () {
list<string> lstFiles;
map<int,int> mapIdsToHits;
list_all_logfiles ( "/home/somedir", lstFiles);
list<string>::const_iterat or i;
for (i = lstFile.begin(); i != lstFiles.end() ++i) {
ProcessFile(*i, mapIdsToHits);
}
DisplayResults(mapIdsToHit s);
return 0;
}
int main () {
list<string> lstFiles;
map<int,int> mapIdsToHits;
list_all_logfiles ( "/home/somedir", lstFiles);
list<string>::const_iterat
for (i = lstFile.begin(); i != lstFiles.end() ++i) {
ProcessFile(*i, mapIdsToHits);
}
DisplayResults(mapIdsToHit
return 0;
}
OK, a lot of typos in the above - the following compiles
#include <sys/types.h>
#include <dirent.h>
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <map>
#include <list>
#include <sstream>
using namespace std;
// Collects the full paths of every "*.log" file directly inside sStartDir.
// The directory is prepended so the files can be opened later regardless of
// the current working directory (bare names caused "Error processing ..."
// further down in this thread).
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // directory could not be opened
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) { // parenthesized assignment
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        // Match the ".log" suffix only; a plain find(".log") would also
        // accept names like "2007-07-22.log.bak".
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + "/" + sName);
    }
    closedir(pDir);
}
// Splits strIn at every occurrence of cDelim and appends the resulting
// tokens (empty tokens included) to vResult.
// Returns the number of delimiters found, i.e. tokens appended minus one.
int split_text(const string& strIn, const char cDelim, vector<string>& vResult) {
    string::size_type nPos = 0; // start of the current token
    int nCount = 0;             // delimiters seen so far
    for (;;) {
        // Compare against string::npos, not -1: find() returns an
        // unsigned type, so storing it in an int was implementation-defined.
        string::size_type nFound = strIn.find(cDelim, nPos);
        if (string::npos == nFound) {
            vResult.push_back(strIn.substr(nPos)); // trailing token
            break;
        }
        vResult.push_back(strIn.substr(nPos, nFound - nPos));
        nPos = nFound + 1; // continue after the delimiter
        ++nCount;
    }
    return nCount;
}
// Reads one log file and tallies, per ID, how often it occurs.
// Expects tab-separated rows with a numeric ID in column 6.
// Rows with fewer than 6 columns (e.g. a blank trailing line) or a
// non-numeric ID are skipped; indexing vResult[5] on such rows blindly
// was the likely cause of the reported segmentation fault.
void ProcessFile(const string& strName, map<int,int>& rmapResults) {
    ifstream is(strName.c_str());
    if (!is.is_open()) {
        cout << "Error processing " << strName << endl;
        return;
    }
    string strLine;
    // Loop on getline() itself; testing eof() before reading processes
    // a phantom empty line at end of file.
    while (getline(is, strLine)) {
        vector<string> vResult;
        split_text(strLine, '\t', vResult);
        if (vResult.size() < 6) continue; // malformed/short row - skip
        stringstream ss(vResult[5]); // ID at col. 6 (zero-based index 5)
        int nId;
        if (!(ss >> nId)) continue; // non-numeric ID - skip
        ++rmapResults[nId]; // operator[] value-initializes to 0 on first use
    }
}
// Prints every distinct ID together with its hit count, one row per ID.
void DisplayResults(const map<int,int>& rmap) {
    map<int,int>::const_iterator i; // fixed extraction-split "const_iterat or"
    cout << "ID:\tCount:" << endl; // header
    for (i = rmap.begin(); i != rmap.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }
}
// Entry point: gather all *.log files, count ID occurrences in each,
// then print the aggregated ID -> hit-count table.
int main () {
    list<string> lstFiles;
    map<int,int> mapIdsToHits;
    list_all_logfiles ( "/home/somedir", lstFiles);
    list<string>::const_iterator i; // fixed extraction-split identifier
    for (i = lstFiles.begin(); i != lstFiles.end(); ++i) {
        ProcessFile(*i, mapIdsToHits);
    }
    DisplayResults(mapIdsToHits); // fixed "mapIdsToHit s"
    return 0;
}
#include <sys/types.h>
#include <dirent.h>
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <map>
#include <list>
#include <sstream>
using namespace std;
// Collects the full paths of every "*.log" file directly inside sStartDir.
// The directory is prepended so the files can be opened later regardless of
// the current working directory.
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // directory could not be opened
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) { // parenthesized assignment
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        // Match the ".log" suffix only, not any substring.
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + "/" + sName);
    }
    closedir(pDir);
}
// Splits strIn at every occurrence of cDelim and appends the resulting
// tokens (empty tokens included) to vResult.
// Returns the number of delimiters found, i.e. tokens appended minus one.
// (Reconstructed: several lines of this copy were truncated by extraction.)
int split_text(const string& strIn, const char cDelim, vector<string>& vResult) {
    string::size_type nPos = 0; // start of the current token
    int nCount = 0;             // delimiters seen so far
    for (;;) {
        // Compare against string::npos, not -1: find() returns an unsigned type.
        string::size_type nFound = strIn.find(cDelim, nPos);
        if (string::npos == nFound) {
            vResult.push_back(strIn.substr(nPos)); // trailing token
            break;
        }
        vResult.push_back(strIn.substr(nPos, nFound - nPos));
        nPos = nFound + 1; // continue after the delimiter
        ++nCount;
    }
    return nCount;
}
void ProcessFile(const string& strName, map<int,int>& rmapResults) {
ifstream is(strName.c_str());
if (!is.is_open()) {
cout << "Error processing " << strName << endl;
return;
}
while(!is.eof()) {
vector<string> vResult;
string strLine;
getline(is,strLine);
int n = split_text(strLine,'\t',vR
int nId;
stringstream ss;
ss << vResult[5]; // get ID at col. 6 (indices are zero-based)
ss >> nId ;
map<int,int>::iterator i = rmapResults.find(nId);
if (i == rmapResults.end()) { // not found yet
int nCount = 1;
rmapResults.insert(map<int
} else { // already seen that ID
i->second++; // increment count
}
}
}
// Prints every distinct ID together with its hit count, one row per ID.
void DisplayResults(const map<int,int>& rmap) {
    map<int,int>::const_iterator i; // reconstructed truncated declaration
    cout << "ID:\tCount:" << endl; // header
    for (i = rmap.begin(); i != rmap.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }
}
int main () {
list<string> lstFiles;
map<int,int> mapIdsToHits;
list_all_logfiles ( "/home/somedir", lstFiles);
list<string>::const_iterat
for (i = lstFiles.begin(); i != lstFiles.end(); ++i) {
ProcessFile(*i, mapIdsToHits);
}
DisplayResults(mapIdsToHit
return 0;
}
Oh, and just if that would be unclear - compile the above using
g++ -o genids genids.cpp
(any filename will do, that is just the one I chose)
g++ -o genids genids.cpp
(any filename will do, that is just the one I chose)
ASKER
thanx for this solution... I will try it asap and let you know!
ASKER
I tried it but it just produced the following output:
found .
found ..
found 2007-06-30.log
Error processing 2007-06-30.log
ID: Count:
Before I compiled it I also changed the line
list_all_logfiles ( "/home/somedir", lstFiles);
to my actual directory for testing which contains this 2007-06-30.log file but I guess the program doesn't open it or something? Can you please check it out? Thanx!
Sorry, the files aren't stored with their full path - make that
// Collects the full paths of every "*.log" file directly inside sStartDir.
// The directory is prepended (the fix this post introduced) so the files
// can be opened later regardless of the current working directory.
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // directory could not be opened
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) { // parenthesized assignment
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        // Test the ".log" suffix on the bare name: running find(".log") on
        // the full path would match every entry if the directory path
        // itself contains ".log".
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + string ( "/") + sName); // <---
    }
    closedir(pDir);
}
instead.
// Collects the full paths of every "*.log" file directly inside sStartDir.
// The directory is prepended so the files can be opened later regardless of
// the current working directory.
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // directory could not be opened
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) { // parenthesized assignment
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        // Suffix test on the bare name: find(".log") on the full path
        // matches everything when the directory path contains ".log".
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + string ( "/") + sName); // <---
    }
    closedir(pDir);
}
instead.
ASKER
Hello,
Well the program works now but after running about one minute it produces a Segmentation fault...
As I said - program has to be able to handle a massive amount of data - about 20,000,000 distinct ID's... Right now I only tested in stripped-down file which has only about 200,000 distinct ID's and it didn't go through... I also tried on a smaller file (only 1,000 rows) and it also produced a segmentation fault...
BTW: did I mention that ID's are 24 character long strings, not integers (if that may be the problem maybe)
Well the program works now but after running about one minute it produces a Segmentation fault...
As I said - program has to be able to handle a massive amount of data - about 20,000,000 distinct ID's... Right now I only tested in stripped-down file which has only about 200,000 distinct ID's and it didn't go through... I also tried on a smaller file (only 1,000 rows) and it also produced a segmentation fault...
BTW: did I mention that ID's are 24 character long strings, not integers (if that may be the problem maybe)
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Forced accept.
Computer101
EE Admin
Computer101
EE Admin
#include <sys/types.h>
#include <dirent.h>
#include <string>
#include <iostream>
#include <list>
using namespace std;
// Collects the full paths of every "*.log" file directly inside sStartDir.
// Fixes vs. this post's version: no "return false" from a void function,
// the missing closing parenthesis in the if-condition, a suffix match
// instead of a substring match, and the directory is prepended so the
// files can actually be opened later.
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // void function: just bail out
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) {
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + "/" + sName);
    }
    closedir(pDir);
}
and use it like
list<string> lstFiles;
list_all_logfiles ( "/home/somedir", lstFiles);
From there on, you can simply process the ID counting like
#include <sys/types.h>
#include <dirent.h>
#include <string>
#include <string>
#include <vector>
#include <iostream>
#include <map>
#include <sstream>
using namespace std;
// Collects the full paths of every "*.log" file directly inside sStartDir.
// Fixes vs. this copy: no "return false" from a void function, the missing
// closing parenthesis in the if-condition, a suffix match instead of a
// substring match, and the directory is prepended so files can be opened.
void list_all_logfiles ( const string& sStartDir, list<string>& lstFound) {
    cout << "checking " << sStartDir << endl;
    DIR* pDir = opendir ( sStartDir.c_str ());
    if ( !pDir) return; // void function: just bail out
    dirent* pEntry;
    while ( (pEntry = readdir ( pDir)) != NULL) {
        cout << "found " << pEntry->d_name << endl;
        string sName = pEntry->d_name;
        if (sName.length() >= 4 && sName.compare(sName.length() - 4, 4, ".log") == 0)
            lstFound.push_back ( sStartDir + "/" + sName);
    }
    closedir(pDir);
}
// Splits strIn at every occurrence of cDelim and appends the resulting
// tokens (empty tokens included) to vResult.
// Returns the number of delimiters found, i.e. tokens appended minus one.
// (Reconstructed: several lines of this copy were truncated by extraction.)
int split_text(const string& strIn, const char cDelim, vector<string>& vResult) {
    string::size_type nPos = 0; // start of the current token
    int nCount = 0;             // delimiters seen so far
    for (;;) {
        // Compare against string::npos, not -1: find() returns an unsigned type.
        string::size_type nFound = strIn.find(cDelim, nPos);
        if (string::npos == nFound) {
            vResult.push_back(strIn.substr(nPos)); // trailing token
            break;
        }
        vResult.push_back(strIn.substr(nPos, nFound - nPos));
        nPos = nFound + 1; // continue after the delimiter
        ++nCount;
    }
    return nCount;
}
void ProcessFile(const string& strName, map<int,int>& rmapResults) {
ifstream is(strName.c_str());
if (!is.is_open()) {
cout << "Error processing " << strName << endl;
return;
}
while(!is.eof()) {
vector<string> vResult;
string strLine;
getline(is.strLine);
int n = split_text(strLine,'\t',vR
int nId;
stringstream ss;
ss << vResult[5]; // get ID at col. 6 (indices are zero-based)
ss >> nId ;
map<int,int>::iterator i = rmapResults.find(nId);
if (i == rmapResults.end()) { // not found yet
int nCount = 1;
rmapResults.insert(map<int
} else { // already seen that ID
i->second++; // increment count
}
}
}
// Prints every distinct ID together with its hit count, one row per ID.
void DisplayResults(const map<int,int>& rmap) {
    map<int,int>::const_iterator i; // reconstructed truncated declaration
    cout << "ID:\tCount:" << endl; // header
    for (i = rmap.begin(); i != rmap.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }
}
int main () {
list<string> lstFiles;
map<int,int> mapIdsToHits;
list_all_logfiles ( "/home/somedir", lstFiles);
list<string>::iterator i;
for (i = lstFile.begin(); i != lstFiles.end() ++i) {
ProcessFile(*i, mapIdsToHits);
}
DisplayResults(mapIdsToHit
return 0;
}