ettt6
asked on
searching between two seperate files
I would like to create a piece of code that would work with the rest of the program I have been developing. Now I would need it to search one file for a population and then all the matches of the population and subsequent data, then go to another file and search for the matching term and print out in a new file. Look at the exmple below
Search term
user enter zulu
file 1
zulu, subharan africa, a*3008, a*2301,B*3433, etc
zulu, subharan africa, a*1232, a*3q43,B*342342, etc
finn 90, europe, a*3434, a*34343,b*232
file 2
>a*3008
12333
>a*a2301
fdfdfd
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
output file
Zulu
>a*3008
12333
>a*a2301
fdfdfd
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
Search term
user enter zulu
file 1
zulu, subharan africa, a*3008, a*2301,B*3433, etc
zulu, subharan africa, a*1232, a*3q43,B*342342, etc
finn 90, europe, a*3434, a*34343,b*232
file 2
>a*3008
12333
>a*a2301
fdfdfd
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
output file
Zulu
>a*3008
12333
>a*a2301
fdfdfd
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
How does the output file differ from "Zulu\n" followed by file 2?
Not too hard, very roughly:
f1 = fopen( "file1", "r" );
f2 = fopen( "file2", "r" );
fgets( Line, sizeof(Line), file1);
p = strchr( ',', Line ); // look for first comma
p = strchr( ',', Line[p]); // look for second comma starting after first one
while( p > 0 ) { // while we have a comma
p2 = strchr( ',', Line[p] ); // find comma after keyqword
if(p2 == 0 ) p2 = 999; else Line[ p+p2 ] = '\0'; // zero end the keyword
strcpy( Key, Line[p] ); // make a copy of the keyword
fseek( f2, 0, FILE_BEGIN ); // rewind the second file
while( ! feof(f2) ) { // scan whole file
fgets( Line2, sizeof(Line2), f2 ); // read a line
if( Line2[0] == '>' )
p3 = strstr( Key, Line2 ); // look for key
else p3 = 0;
if( p3 > 0 ) { // if we find the key
do { // copy lines til next '>'
fgets( Line2, sizeof(Line2), f2 ) {
if Line2[0] != '>' ) printf( "%s", Line2 );
} while( Line2[0] ! = '>' && ! feof( f2 ) );
}
}
p = p2; Skip to after keyword
p = strchr( ',', Line[p]); // look for next comma
}
f1 = fopen( "file1", "r" );
f2 = fopen( "file2", "r" );
fgets( Line, sizeof(Line), file1);
p = strchr( ',', Line ); // look for first comma
p = strchr( ',', Line[p]); // look for second comma starting after first one
while( p > 0 ) { // while we have a comma
p2 = strchr( ',', Line[p] ); // find comma after keyqword
if(p2 == 0 ) p2 = 999; else Line[ p+p2 ] = '\0'; // zero end the keyword
strcpy( Key, Line[p] ); // make a copy of the keyword
fseek( f2, 0, FILE_BEGIN ); // rewind the second file
while( ! feof(f2) ) { // scan whole file
fgets( Line2, sizeof(Line2), f2 ); // read a line
if( Line2[0] == '>' )
p3 = strstr( Key, Line2 ); // look for key
else p3 = 0;
if( p3 > 0 ) { // if we find the key
do { // copy lines til next '>'
fgets( Line2, sizeof(Line2), f2 ) {
if Line2[0] != '>' ) printf( "%s", Line2 );
} while( Line2[0] ! = '>' && ! feof( f2 ) );
}
}
p = p2; Skip to after keyword
p = strchr( ',', Line[p]); // look for next comma
}
ozo is right, is
Zulu
>a*3008
12333
Zulu
>a*a2301
fdfdfd
Finn 90
>a*1232
asdfdfdf
what you mean? What's the corellation between the columns in file 1 and the data blocks in file 2?
Zulu
>a*3008
12333
Zulu
>a*a2301
fdfdfd
Finn 90
>a*1232
asdfdfdf
what you mean? What's the corellation between the columns in file 1 and the data blocks in file 2?
ASKER
The file differ in that its only one population per file, as opposed to many populations in one file
Ah - that means that the population you want to look for has to be known beforehand and you take all the fileds beginning with the 3rd and corellate them with the data in the 2nd?
ASKER
yea the 3,4,6,7,9,10,12,13,15,16,1 8,19
An, OK. So you could
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return false; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
string sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,-1, 6,7,9,10,12,13,15,16,18,19 , -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (1,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i],sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
strType = strLine.substr(1);
cout << " found " << strType << endl;
getline(is,strData);
cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins) {
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat or im = mapProteins.find(*il);
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula tion ( "pop.txt", sPop, lstTypes);
ReadProteins ( "haplo.txt", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
return 0;
}
With your sample above, it creates
zulu
>a*3008
12333
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return false; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
string sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,-1, 6,7,9,10,12,13,15,16,18,19
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (1,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i],sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
strType = strLine.substr(1);
cout << " found " << strType << endl;
getline(is,strData);
cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula
ReadProteins ( "haplo.txt", mapProteins);
AssociateProteinsWithHaplo
return 0;
}
With your sample above, it creates
zulu
>a*3008
12333
>a*1232
asdfdfdf
>a*3q43
asfdfdsf
ASKER
Its not working right and I think becuase the population file is still that same mhc.csv file whilie the other is a text file
>> Its not working right
Could you be a little more precise?
Could you be a little more precise?
Oh, and BTW - you might want to consider providing a portion of the original file.
And, did you follow the instructions in
static int anFields[] = { 3,4,-1, 6,7,9,10,12,13,15,16,18,19 , -1}; // TEST - remove the 1st -1
?
And, did you follow the instructions in
static int anFields[] = { 3,4,-1, 6,7,9,10,12,13,15,16,18,19
?
ASKER
sorry for being so slack and getting back to you. the program would jsut crash each time. and the files are the same files we have been working with in previous questions.
>> the files are the same files we have been working with in previous questions.
Um, no - here, 'Zulu' is in the 1st field, 'mhc.csv' has it in the 3rd...
Um, no - here, 'Zulu' is in the 1st field, 'mhc.csv' has it in the 3rd...
I modified the program we had in a previous question. You'll get the result by searching for a population.
Regards, Alex
#pragma warning (disable : 4786 4503)
#include <map>
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
#include <iostream>
#include <windows.h>
using namespace std;
typedef vector<string> Population;
typedef vector<Population*> PopulationSet;
class PopulationDatabase
{
string filename1;
string filename2;
PopulationSet allPopulation;
map<string, PopulationSet > mapPopulation;
map<string, PopulationSet > mapPopArea;
map<string, string > mapHaploTypes;
public:
PopulationDatabase(const string& file1, const string& file2 )
: filename1(file1), filename2(file2) {}
~PopulationDatabase();
int readFromFiles();
int parsePopulationRecord( const string& line, Population& array);
int searchPop(const string& pop, PopulationSet& rset);
int searchArea(const string& area, PopulationSet& rset);
void displayRec(const Population& rec);
void displaySet(const PopulationSet& rset);
void displayAll();
void writeOutputfile(Population Set& rset);
};
PopulationDatabase::~Popul ationDatab ase()
{
for (int i = 0; i < allPopulation.size(); ++i)
delete allPopulation[i];
}
int PopulationDatabase::parseP opulationR ecord( const string& line, Population& array)
{
istringstream iss(line);
string entry;
string text;
bool literal = false;
while (getline(iss, entry, ','))
{
if (literal)
{
if (entry.rfind('"') == entry.length()-1)
{
literal = false;
entry.resize(entry.length( )-1);
text += entry;
entry = text;
}
else
{
text += entry;
text += ',';
continue;
}
}
else if (entry.find('"') == 0)
{
if (entry.rfind('"') == entry.length()-1)
entry = entry.substr(1, entry.length()-2);
else
{
literal = true;
text = entry;
text += ',';
continue;
}
}
array.push_back(entry);
}
return array.size();
}
int PopulationDatabase::readFr omFiles()
{
ifstream ifs(filename1.c_str(), ios::binary | ios::in);
string line;
line.reserve(1024);
int count = 0;
char c;
while (ifs.read(&c, 1))
{
// check for end-of-line
if (c == '\r' || c == '\n')
{
if (line.empty())
continue;
Population* pPop = new Population;
pPop->reserve(30);
int n = parsePopulationRecord(line , *pPop);
if (n != 30)
{
cout << "Error: wrong number of commas in line " << count << endl;
continue; // ignore line
}
allPopulation.push_back(pP op);
PopulationSet& setP = mapPopulation[(*pPop)[2]]; // get set from map Population
// can be a new empty set
setP.push_back(pPop);
PopulationSet& setA = mapPopArea[(*pPop)[3]]; // get set from map PopArea
setA.push_back(pPop); // can be a new empty set
line = "";
}
else
line += c; // add char to line
}
ifs.close();
ifstream ifs2(filename2.c_str());
string key, data;
while (getline(ifs2, key) && getline(ifs2, data))
{
if (key.find('>') != 0)
continue;
mapHaploTypes[key] = data;
}
ifs2.close();
return allPopulation.size();
}
int PopulationDatabase::search Pop(const string& pop, PopulationSet& rset)
{
rset = mapPopulation[pop];
return rset.size();
}
int PopulationDatabase::search Area(const string& area, PopulationSet& rset)
{
rset = mapPopArea[area];
return rset.size();
}
void PopulationDatabase::displa yRec(const Population& rec)
{
for (int i = 0; i < rec.size(); ++i)
{
cout << rec[i];
if (i < rec.size()-1) cout << ',';
else cout << endl;
}
}
void PopulationDatabase::writeO utputfile( Population Set& rset)
{
static int haploTypeCols[] = { 4,5,7,8,10,11,13,14,16,17, 19,20 };
ofstream ofs("output.txt");
string pop;
for (int i = 0; i < rset.size(); ++i)
{
Population& rec = *rset[i];
if (rec[2] != pop)
{
pop = rec[2];
ofs << rec[2] << endl;
}
for (int j = 0; j < sizeof(haploTypeCols)/size of(int); ++j)
{
string key = ">";
key += rec[haploTypeCols[j]];
if (key == ">")
continue;
map<string, string>::iterator f = mapHaploTypes.find(key);
if (f == mapHaploTypes.end())
continue;
ofs << key << endl << f->second << endl;
}
}
ofs.close();
}
void PopulationDatabase::displa ySet(const PopulationSet& rset)
{
for (int i = 0; i < rset.size(); ++i)
displayRec(*rset[i]);
}
void PopulationDatabase::displa yAll()
{
displaySet(allPopulation);
}
int main()
{
while (true)
{
string file1, file2;
cout << "Enter Filename 1 ==>";
getline(cin, file1);
cout << "Enter Filename 2 ==>";
getline(cin, file2);
PopulationDatabase popDB(file1, file2);
int npops = popDB.readFromFiles();
if (npops == 0)
{
cout << "No entries in file " << file1 << " or file doesn't exist" << endl;
cout << "Do you want to continue? ";
string c;
getline (cin, c);
if (c == "y" || c == "Y")
continue;
break;
}
cout << "The database has " << npops << " entries" << endl << endl;
while (true)
{
cout << endl;
cout << "1 Display All" << endl;
cout << "2 Search Population" << endl;
cout << "3 Search Pop. Area" << endl;
cout << "4 Exit" << endl;
cout << endl << "Make Your Choice ==> ";
string c;
getline (cin, c);
switch (*c.begin())
{
case '1': popDB.displayAll(); break;
case '2':
{
cout << "Enter the Population you want to Search for ==> ";
string pop;
getline(cin, pop, '\n');
PopulationSet rset;
int n = popDB.searchPop(pop, rset);
if (n == 0)
cout << "Your search for " << pop << " doesn't match any record" << endl;
else
cout << "Your search matches " << n << " records" << endl << endl;
popDB.displaySet(rset);
popDB.writeOutputfile(rset );
break;
}
case '3':
{
cout << "Enter the Population Area you want to Search for ==> ";
string area;
getline(cin, area, '\n');
PopulationSet rset;
int n = popDB.searchArea(area, rset);
if (n == 0)
cout << "Your search for " << area << " doesn't match any record" << endl;
else
cout << "Your search matches " << n << " records" << endl << endl;
popDB.displaySet(rset);
break;
}
case '4':
return 1;
}
}
}
return 0;
}
Regards, Alex
#pragma warning (disable : 4786 4503)
#include <map>
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
#include <iostream>
#include <windows.h>
using namespace std;
typedef vector<string> Population;
typedef vector<Population*> PopulationSet;
class PopulationDatabase
{
string filename1;
string filename2;
PopulationSet allPopulation;
map<string, PopulationSet > mapPopulation;
map<string, PopulationSet > mapPopArea;
map<string, string > mapHaploTypes;
public:
PopulationDatabase(const string& file1, const string& file2 )
: filename1(file1), filename2(file2) {}
~PopulationDatabase();
int readFromFiles();
int parsePopulationRecord( const string& line, Population& array);
int searchPop(const string& pop, PopulationSet& rset);
int searchArea(const string& area, PopulationSet& rset);
void displayRec(const Population& rec);
void displaySet(const PopulationSet& rset);
void displayAll();
void writeOutputfile(Population
};
PopulationDatabase::~Popul
{
for (int i = 0; i < allPopulation.size(); ++i)
delete allPopulation[i];
}
int PopulationDatabase::parseP
{
istringstream iss(line);
string entry;
string text;
bool literal = false;
while (getline(iss, entry, ','))
{
if (literal)
{
if (entry.rfind('"') == entry.length()-1)
{
literal = false;
entry.resize(entry.length(
text += entry;
entry = text;
}
else
{
text += entry;
text += ',';
continue;
}
}
else if (entry.find('"') == 0)
{
if (entry.rfind('"') == entry.length()-1)
entry = entry.substr(1, entry.length()-2);
else
{
literal = true;
text = entry;
text += ',';
continue;
}
}
array.push_back(entry);
}
return array.size();
}
int PopulationDatabase::readFr
{
ifstream ifs(filename1.c_str(), ios::binary | ios::in);
string line;
line.reserve(1024);
int count = 0;
char c;
while (ifs.read(&c, 1))
{
// check for end-of-line
if (c == '\r' || c == '\n')
{
if (line.empty())
continue;
Population* pPop = new Population;
pPop->reserve(30);
int n = parsePopulationRecord(line
if (n != 30)
{
cout << "Error: wrong number of commas in line " << count << endl;
continue; // ignore line
}
allPopulation.push_back(pP
PopulationSet& setP = mapPopulation[(*pPop)[2]];
// can be a new empty set
setP.push_back(pPop);
PopulationSet& setA = mapPopArea[(*pPop)[3]]; // get set from map PopArea
setA.push_back(pPop); // can be a new empty set
line = "";
}
else
line += c; // add char to line
}
ifs.close();
ifstream ifs2(filename2.c_str());
string key, data;
while (getline(ifs2, key) && getline(ifs2, data))
{
if (key.find('>') != 0)
continue;
mapHaploTypes[key] = data;
}
ifs2.close();
return allPopulation.size();
}
int PopulationDatabase::search
{
rset = mapPopulation[pop];
return rset.size();
}
int PopulationDatabase::search
{
rset = mapPopArea[area];
return rset.size();
}
void PopulationDatabase::displa
{
for (int i = 0; i < rec.size(); ++i)
{
cout << rec[i];
if (i < rec.size()-1) cout << ',';
else cout << endl;
}
}
void PopulationDatabase::writeO
{
static int haploTypeCols[] = { 4,5,7,8,10,11,13,14,16,17,
ofstream ofs("output.txt");
string pop;
for (int i = 0; i < rset.size(); ++i)
{
Population& rec = *rset[i];
if (rec[2] != pop)
{
pop = rec[2];
ofs << rec[2] << endl;
}
for (int j = 0; j < sizeof(haploTypeCols)/size
{
string key = ">";
key += rec[haploTypeCols[j]];
if (key == ">")
continue;
map<string, string>::iterator f = mapHaploTypes.find(key);
if (f == mapHaploTypes.end())
continue;
ofs << key << endl << f->second << endl;
}
}
ofs.close();
}
void PopulationDatabase::displa
{
for (int i = 0; i < rset.size(); ++i)
displayRec(*rset[i]);
}
void PopulationDatabase::displa
{
displaySet(allPopulation);
}
int main()
{
while (true)
{
string file1, file2;
cout << "Enter Filename 1 ==>";
getline(cin, file1);
cout << "Enter Filename 2 ==>";
getline(cin, file2);
PopulationDatabase popDB(file1, file2);
int npops = popDB.readFromFiles();
if (npops == 0)
{
cout << "No entries in file " << file1 << " or file doesn't exist" << endl;
cout << "Do you want to continue? ";
string c;
getline (cin, c);
if (c == "y" || c == "Y")
continue;
break;
}
cout << "The database has " << npops << " entries" << endl << endl;
while (true)
{
cout << endl;
cout << "1 Display All" << endl;
cout << "2 Search Population" << endl;
cout << "3 Search Pop. Area" << endl;
cout << "4 Exit" << endl;
cout << endl << "Make Your Choice ==> ";
string c;
getline (cin, c);
switch (*c.begin())
{
case '1': popDB.displayAll(); break;
case '2':
{
cout << "Enter the Population you want to Search for ==> ";
string pop;
getline(cin, pop, '\n');
PopulationSet rset;
int n = popDB.searchPop(pop, rset);
if (n == 0)
cout << "Your search for " << pop << " doesn't match any record" << endl;
else
cout << "Your search matches " << n << " records" << endl << endl;
popDB.displaySet(rset);
popDB.writeOutputfile(rset
break;
}
case '3':
{
cout << "Enter the Population Area you want to Search for ==> ";
string area;
getline(cin, area, '\n');
PopulationSet rset;
int n = popDB.searchArea(area, rset);
if (n == 0)
cout << "Your search for " << area << " doesn't match any record" << endl;
else
cout << "Your search matches " << n << " records" << endl << endl;
popDB.displaySet(rset);
break;
}
case '4':
return 1;
}
}
}
return 0;
}
ASKER
I placed the trim line function from the old program in jkr, but so that it would make zulu the first field but i frogot that the 2,5,8,11,14,17 columns are either blank or have infomrtion we dont want to search. Also the first row is of no help
So, using the original .csv file, we could just add '3' to the column numbers? Let me give that a try...
Yup, with changing that to
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1 8,19, -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
it seems to work. Using
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
as 'file2' and your mhc.csv yields
Zulu
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
it seems to work. Using
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
as 'file2' and your mhc.csv yields
Zulu
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
ASKER
so should i use the the trimline fucntion, or just go as you have it
I'd go with the above - it is IMHO better to keep the data intact and in the original format as long as possible.
ASKER
the program still ends up crashing and windows has to exit it...
Well, it works here. At least what I posted above.... What code are you using exactly? (sorry for asking, but my crystal ball is on repair :o)
ASKER
lol....i used exactly the code you gave me with the additions
ASKER
This is what i have
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return false; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
string sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1 8,19, -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
strType = strLine.substr(1);
// cout << " found " << strType << endl;
getline(is,strData);
cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins) {
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat or im = mapProteins.find(*il);
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
return 0;
}
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return false; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
string sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
strType = strLine.substr(1);
// cout << " found " << strType << endl;
getline(is,strData);
cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo
return 0;
}
Runs fine where, giving the same result as above. What does "hla_nuc.fasta.txt" look like?
ASKER
>HLA:HLA00001 A*010101
ATGGCCGTCATGGCGCCCCGAACCCT CCTCCTGCTA CTCTCGGGGG CCCTGGCCCT GACCCAGACC TGGGCGGGCT CCCACTCCAT GAGGTATTTC TTCA
CATCCGTGTCCCGGCCCGGCCGCGGG GAGCCCCGCT TCATCGCCGT GGGCTACGTG GACGACACGC AGTTCGTGCG GTTCGACAGC GACGCCGCGA GCCA
GAAGATGGAGCCGCGGGCGCCGTGGA TAGAGCAGGA GGGGCCGGAG TATTGGGACC AGGAGACACG GAATATGAAG GCCCACTCAC AGACTGACCG AGCG
AACCTGGGGACCCTGCGCGGCTACTA CAACCAGAGC GAGGACGGTT CTCACACCAT CCAGATAATG TATGGCTGCG ACGTGGGGCC GGACGGGCGC TTCC
TCCGCGGGTACCGGCAGGACGCCTAC GACGGCAAGG ATTACATCGC CCTGAACGAG GACCTGCGCT CTTGGACCGC GGCGGACATG GCAGCTCAGA TCAC
CAAGCGCAAGTGGGAGGCGGTCCATG CGGCGGAGCA GCGGAGAGTC TACCTGGAGG GCCGGTGCGT GGACGGGCTC CGCAGATACC TGGAGAACGG GAAG
GAGACGCTGCAGCGCACGGACCCCCC CAAGACACAT ATGACCCACC ACCCCATCTC TGACCATGAG GCCACCCTGA GGTGCTGGGC CCTGGGCTTC TACC
CTGCGGAGATCACACTGACCTGGCAG CGGGATGGGG AGGACCAGAC CCAGGACACG GAGCTCGTGG AGACCAGGCC TGCAGGGGAT GGAACCTTCC AGAA
GTGGGCGGCTGTGGTGGTGCCTTCTG GAGAGGAGCA GAGATACACC TGCCATGTGC AGCATGAGGG TCTGCCCAAG CCCCTCACCC TGAGATGGGA GCTG
TCTTCCCAGCCCACCATCCCCATCGT GGGCATCATT GCTGGCCTGG TTCTCCTTGG AGCTGTGATC ACTGGAGCTG TGGTCGCTGC CGTGATGTGG AGGA
GGAAGAGCTCAGATAGAAAAGGAGGG AGTTACACTC AGGCTGCAAG CAGTGACAGT GCCCAGGGCT CTGATGTGTC TCTCACAGCT TGTAAAGTGT GA
>HLA:HLA01244 A*010102
GCTCCCACTCCATGAGGTATTTCTTC ACATCCGTGT CCCGGCCCGG CCGCGGGGAG CCCCGCTTCA TCGCCGTGGG CTACGTGGAC GACACGCAGT TCGT
GCGGTTCGACAGCGACGCCGCGAGCC AGAAGATGGA GCCGCGGGCG CCGTGGATAG AGCAGGAGGG GCCGGAGTAT TGGGACCAGG AGACACGGAA TATG
AAGGCCCACTCACAGACTGACCGAGC GAACCTGGGG ACCCTGCGCG GCTACTACAA CCAGAGCGAG GACGGTTCTC ACACCATCCA GATAATGTAT GGCT
GCGACGTGGGGCCGGACGGGCGCTTC CTCCGCGGGT ACCGGCAGGA CGCCTACGAC GGCAAGGATT ACATCGCCCT GAACGAGGAC CTGCGCTCTT GGAC
CGCGGCGGACATGGCAGCTCAGATTA CCAAGCGCAA GTGGGAGGCG GTCCATGCGG CGGAGCAGCG GAGAGTCTAC CTGGAGGGCC GGTGCGTGGA CGGG
CTCCGCAGATACCTGGAGAACGGGAA GGAGACGCTG CAGCGCACGG
>HLA:HLA01971 A*010103
ATGGCCGTCATGGCGCCCCGAACCCT CCTCCTGCTA CTCTCGGGGG CCCTGGCCCT GACCCAGACC TGGGCGGGCT CCCACTCCAT GAGGTATTTC TTCA
CATCCGTGTCCCGGCCCGGCCGCGGG GAGCCCCGCT TCATCGCCGT GGGCTACGTG GACGACACGC AGTTCGTGCG GTTCGACAGC GACGCCGCGA GCCA
GAAGATGGAGCCGCGGGCGCCGTGGA TAGAGCAGGA GGGGCCGGAG TATTGGGACC AGGAGACACG GAATATGAAG GCCCACTCAC AGACTGACCG AGCG
AACCTGGGGACCCTGCGCGGCTACTA CAACCAGAGC GAGGACGGTT CTCACACCAT CCAGATAATG TATGGCTGCG ACGTGGGGCC GGACGGGCGC TTCC
TCCGCGGGTACCGGCAGGACGCCTAC GACGGCAAGG ATTACATCGC CCTGAACGAG GACCTGCGCT CTTGGACCGC GGCGGACATG GCAGCTCAGA TCAC
CAAGCGCAAGTGGGAGGCGGTCCATG CGGCGGAGCA GCGGAGAGTC TACCTGGAGG GCCGGTGCGT GGACGGGCTC CGCAGATACC TGGAGAACGG GAAG
GAGACGCTGCAGCGCACTGACCCCCC CAAGACACAT ATGACCCACC ACCCCATCTC TGACCATGAG GCCACCCTGA GGTGCTGGGC CCTGGGCTTC TACC
CTGCGGAGATCACACTGACCTGGCAG CGGGATGGGG AGGACCAGAC CCAGGACACG GAGCTCGTGG AGACCAGGCC TGCAGGGGAT GGAACCTTCC AGAA
GTGGGCGGCTGTGGTGGTGCCTTCTG GAGAGGAGCA GAGATACACC TGCCATGTGC AGCATGAGGG TCTGCCCAAG CCCCTCACCC TGAGATGGG
>HLA:HLA00002 A*0102
ATGGCCGTCATGGCGCCCCGAACCCT CCTCCTGCTA CTCTCGGGGG CCCTGGCCCT GACCCAGACC TGGGCGGGCT CCCACTCCAT GAGGTATTTC TCCA
CATCCGTGTCCCGGCCCGGCAGTGGA GAGCCCCGCT TCATCGCAGT GGGCTACGTG GACGACACGC AGTTCGTGCG GTTCGACAGC GACGCCGCGA GCCA
GAAGATGGAGCCGCGGGCGCCGTGGA TAGAGCAGGA GGGGCCGGAG TATTGGGACC AGGAGACACG GAATATGAAG GCCCACTCAC AGACTGACCG AGCG
AACCTGGGGACCCTGCGCGGCTACTA CAACCAGAGC GAGGACGGTT CTCACACCAT CCAGATAATG TATGGCTGCG ACGTGGGGCC GGACGGGCGC TTCC
TCCGCGGGTACCGGCAGGACGCCTAC GACGGCAAGG ATTACATCGC CCTGAACGAG GACCTGCGCT CTTGGACCGC GGCGGACATG GCAGCTCAGA TCAC
CAAGCGCAAGTGGGAGGCGGTCCATG CGGCGGAGCA GCGGAGAGTC TACCTGGAGG GCCGGTGCGT GGACGGGCTC CGCAGATACC TGGAGAACGG GAAG
GAGACGCTGCAGCGCACGGACCCCCC CAAGACACAT ATGACCCACC ACCCCATCTC TGACCATGAG GCCACCCTGA GGTGCTGGGC CCTGGGCTTC TACC
CTGCGGAGATCACACTGACCTGGCAG CGGGATGGGG AGGACCAGAC CCAGGACACG GAGCTCGTGG AGACCAGGCC TGCAGGGGAT GGAACCTTCC AGAA
GTGGGCGGCTGTGGTGGTGCCTTCTG GAGAGGAGCA GAGATACACC TGCCATGTGC AGCATGAGGG TCTGCCCAAG CCCCTCACCC TGAGATGGGA GCTG
TCTTCCCAGCCCACCATCCCCATCGT GGGCATCATT GCTGGCCTGG TTCTCCTTGG AGCTGTGATC ACTGGAGCTG TGGTCGCTGC CGTGATGTGG AGGA
GGAAGAGCTCAGATAGAAAAGGAGGG AGTTACACTC AGGCTGCAAG CAGTGACAGT GCCCAGGGCT CTGATGTGTC TCTCACAGCT TGTAAAGTGT GA
>HLA:HLA00003 A*0103
GCTCCCACTCCATGAGGTATTTCTTC ACATCCGTGT CCCGGCCCGG CCGCGGGGAG CCCCGCTTCA TCGCCGTGGG CTACGTGGAC GACACGCAGT TCGT
GCGGTTCGACAGCGACGCCGCGAGCC AGAAGATGGA GCCGCGGGCG CCGTGGATAG AGCAGGAGGG GCCGGAGTAT TGGGACCAGG AGACACGGAA TATG
AAGGCCCACTCACAGACTGACCGAGC GAACCTGGGG ACCCTGCGCG GCTACTACAA CCAGAGCGAG GACGGTTCTC ACACCATCCA GATGATGTAT GGCT
GCGACGTGGGGCCGGACGGGCGCTTC CTCCGCGGGT ACCGGCAGGA CGCCTACGAC GGCAAGGATT ACATCGCCCT GAACGAGGAC CTGCGCTCTT GGAC
CGCGGCGGACATGGCAGCTCAGATCA CCAAGCGCAA GTGGGAGGCG GTCCATGCGG CGGAGCAGCG GAGAGTCTAC CTGGAGGGCC GGTGCGTGGA CGGG
CTCCGCAGATACCTGGAGAACGGGAA GGAGACGCTG CAGCGCACGG
>HLA:HLA00004 A*0104N
ATGGCCGTCATGGCGCCCCGAACCCT CCTCCTGCTA CTCTCGGGGG CCCTGGCCCT GACCCAGACC TGGGCGGGCT CCCACTCCAT GAGGTATTTC TTCA
CATCCGTGTCCCGGCCCGGCCGCGGG GAGCCCCGCT TCATCGCCGT GGGCTACGTG GACGACACGC AGTTCGTGCG GTTCGACAGC GACGCCGCGA GCCA
GAAGATGGAGCCGCGGGCGCCGTGGA TAGAGCAGGA GGGGCCGGAG TATTGGGACC AGGAGACACG GAATATGAAG GCCCACTCAC AGACTGACCG AGCG
AACCTGGGGACCCTGCGCGGCTACTA CAACCAGAGC GAGGACGGTT CTCACACCAT CCAGATAATG TATGGCTGCG ACGTGGGGCC GGACGGGCGC TTCC
TCCGCGGGTACCGGCAGGACGCCTAC GACGGCAAGG ATTACATCGC CCTGAACGAG GACCTGCGCT CTTGGACCGC GGCGGACATG GCAGCTCAGA TCAC
CAAGCGCAAGTGGGAGGCGGTCCATG CGGCGGAGCA GCGGAGAGTC TACCTGGAGG GCCGGTGCGT GGACGGGCTC CGCAGATACC TGGAGAACGG GAAG
GAGACGCTGCAGCGCACGGACCCCCC CCAAGACACA TATGACCCAC CACCCCATCT CTGACCATGA GGCCACCCTG AGGTGCTGGG CCCTGGGCTT CTAC
CCTGCGGAGATCACACTGACCTGGCA GCGGGATGGG GAGGACCAGA CCCAGGACAC GGAGCTCGTG GAGACCAGGC CTGCAGGGGA TGGAACCTTC CAGA
AGTGGGCGGCTGTGGTGGTGCCTTCT GGAGAGGAGC AGAGATACAC CTGCCATGTG CAGCATGAGG GTCTGCCCAA GCCCCTCACC CTGAGATGGG AGCT
GTCTTCCCAGCCCACCATCCCCATCG TGGGCATCAT TGCTGGCCTG GTTCTCCTTG GAGCTGTGAT CACTGGAGCT GTGGTCGCTG CCGTGATGTG GAGG
AGGAAGAGCTCAGATAGAAAAGGAGG GAGTTACACT CAGGCTGCAA GCAGTGACAG TGCCCAGGGC TCTGATGTGT CTCTCACAGC TTGTAAAGTG TGA
>HLA:HLA01031 A*0106
GCTCCCACTCCATGAGGTATTTCTTC ACATCCGTGT CCCGGCCCGG CCGCGGGGAG CCCCGCTTCA TCGCCGTGGG CTACGTGGAC GACACGCAGT TCGT
GCGGTTCGACAGCGACGCCGCGAGCC AGAAGATGGA GCCGCGGGCG CCGTGGATAG AGCAGGAGGG GCCGGAGTAT TGGGACCAGG AGACACGGAA TATG
AAGGCCCACTCACAGACTGACCGAGC GAACCTGGGG ACCCTGCGCG GCTACTACAA CCAGAGCGAG GACGGTTCTC ACACCATCCA GATAATGTAT GGCT
GCGACGTGGGGCCGGACGGGCGCTTC CTCCGCGGGT ACCGGCAGGA CGCCTACGAC GGCAAGGATT ACATCGCCCT GAACGAGGAC CTGCGCTCTT GGAC
CGCGGCGGACATGGCAGCTCAGATCA CCAAGCGCAA GTGGGAGGCG GTCCATGCGG CGGAGCAGTT GAGAGCCTAC CTGGAGGGCC GGTGCGTGGA CGGG
CTCCGCAGATACCTGGAGAACGGGAA GGAGACGCTG CAGCGCACGG
>HLA:HLA01208 A*0107
GCTCCCACTCCATGAGGTATTTCTTC ACATCCGTGT CCCGGCCCGG CCGCGGGGAG CCCCGCTTCA TCGCCGTGGG CTACGTGGAC GACACGCAGT TCGT
GCGGTTCGACAGCGACGCCGCGAGCC AGAAGATGGA GCCGCGGGCG CCGTGGATAG AGCAGGAGAG GCCTGAGTAT TGGGACCAGG AGACACGGAA TGTG
AAGGCCCACTCACAGACTGACCGAGA GAACCTGGGG ACCCTGCGCG GCTACTACAA CCAGAGCGAG GCCGGTTCTC ACACCATCCA GATAATGTAT GGCT
GCGACGTGGGGCCGGACGGGCGCTTC CTCCGCGGGT ACCGGCAGGA CGCCTACGAC GGCAAGGATT ACATCGCCCT GAACGAGGAC CTGCGCTCTT GGAC
CGCGGCGGACATGGCAGCTCAGATCA CCAAGCGCAA GTGGGAGGCG GTCCATGCGG CGGAGCAGCG GAGAGTCTAC CTGGAGGGCC GGTGCGTGGA CGGG
CTCCGCAGATACCTGGAGAACGGGAA GGAGACGCTG CAGCGCACGG
ATGGCCGTCATGGCGCCCCGAACCCT
CATCCGTGTCCCGGCCCGGCCGCGGG
GAAGATGGAGCCGCGGGCGCCGTGGA
AACCTGGGGACCCTGCGCGGCTACTA
TCCGCGGGTACCGGCAGGACGCCTAC
CAAGCGCAAGTGGGAGGCGGTCCATG
GAGACGCTGCAGCGCACGGACCCCCC
CTGCGGAGATCACACTGACCTGGCAG
GTGGGCGGCTGTGGTGGTGCCTTCTG
TCTTCCCAGCCCACCATCCCCATCGT
GGAAGAGCTCAGATAGAAAAGGAGGG
>HLA:HLA01244 A*010102
GCTCCCACTCCATGAGGTATTTCTTC
GCGGTTCGACAGCGACGCCGCGAGCC
AAGGCCCACTCACAGACTGACCGAGC
GCGACGTGGGGCCGGACGGGCGCTTC
CGCGGCGGACATGGCAGCTCAGATTA
CTCCGCAGATACCTGGAGAACGGGAA
>HLA:HLA01971 A*010103
ATGGCCGTCATGGCGCCCCGAACCCT
CATCCGTGTCCCGGCCCGGCCGCGGG
GAAGATGGAGCCGCGGGCGCCGTGGA
AACCTGGGGACCCTGCGCGGCTACTA
TCCGCGGGTACCGGCAGGACGCCTAC
CAAGCGCAAGTGGGAGGCGGTCCATG
GAGACGCTGCAGCGCACTGACCCCCC
CTGCGGAGATCACACTGACCTGGCAG
GTGGGCGGCTGTGGTGGTGCCTTCTG
>HLA:HLA00002 A*0102
ATGGCCGTCATGGCGCCCCGAACCCT
CATCCGTGTCCCGGCCCGGCAGTGGA
GAAGATGGAGCCGCGGGCGCCGTGGA
AACCTGGGGACCCTGCGCGGCTACTA
TCCGCGGGTACCGGCAGGACGCCTAC
CAAGCGCAAGTGGGAGGCGGTCCATG
GAGACGCTGCAGCGCACGGACCCCCC
CTGCGGAGATCACACTGACCTGGCAG
GTGGGCGGCTGTGGTGGTGCCTTCTG
TCTTCCCAGCCCACCATCCCCATCGT
GGAAGAGCTCAGATAGAAAAGGAGGG
>HLA:HLA00003 A*0103
GCTCCCACTCCATGAGGTATTTCTTC
GCGGTTCGACAGCGACGCCGCGAGCC
AAGGCCCACTCACAGACTGACCGAGC
GCGACGTGGGGCCGGACGGGCGCTTC
CGCGGCGGACATGGCAGCTCAGATCA
CTCCGCAGATACCTGGAGAACGGGAA
>HLA:HLA00004 A*0104N
ATGGCCGTCATGGCGCCCCGAACCCT
CATCCGTGTCCCGGCCCGGCCGCGGG
GAAGATGGAGCCGCGGGCGCCGTGGA
AACCTGGGGACCCTGCGCGGCTACTA
TCCGCGGGTACCGGCAGGACGCCTAC
CAAGCGCAAGTGGGAGGCGGTCCATG
GAGACGCTGCAGCGCACGGACCCCCC
CCTGCGGAGATCACACTGACCTGGCA
AGTGGGCGGCTGTGGTGGTGCCTTCT
GTCTTCCCAGCCCACCATCCCCATCG
AGGAAGAGCTCAGATAGAAAAGGAGG
>HLA:HLA01031 A*0106
GCTCCCACTCCATGAGGTATTTCTTC
GCGGTTCGACAGCGACGCCGCGAGCC
AAGGCCCACTCACAGACTGACCGAGC
GCGACGTGGGGCCGGACGGGCGCTTC
CGCGGCGGACATGGCAGCTCAGATCA
CTCCGCAGATACCTGGAGAACGGGAA
>HLA:HLA01208 A*0107
GCTCCCACTCCATGAGGTATTTCTTC
GCGGTTCGACAGCGACGCCGCGAGCC
AAGGCCCACTCACAGACTGACCGAGA
GCGACGTGGGGCCGGACGGGCGCTTC
CGCGGCGGACATGGCAGCTCAGATCA
CTCCGCAGATACCTGGAGAACGGGAA
Even that works w/o any crash. Yet there are no results, since the file format is obviously *different* from what you specified.
ASKER
my mistakes...
Well, that's not a big deal, you can take care of it using
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos);
cout << " found " << strType << endl;
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos);
cout << " found " << strType << endl;
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
ASKER
its still crashing i think the problem is somehwere in the read protein function, could it be something with the parenthesis
ASKER
you cut and pasted the code that i put up there and it worked for you? im confused....
What is the output before the crash?
ASKER
it shows raw, then found then crashes
>> you cut and pasted the code that i put up there and it worked for you? im confused....
Yes, exactly.
Could you post the first 10 lines of your 'mhc.csv'?
Yes, exactly.
Could you post the first 10 lines of your 'mhc.csv'?
ASKER
Source Subject Population Pop. Area HLA-A 1 HLA-A 2 mismatch HLA-B 1 HLA-B 2 mismatch HLA-C 1 HLA-C 2 mismatch HLA-DRB1 1 HLA-DRB1 2 mismatch HLA-DQA1 1 HLA-DQA1 2 mismatch HLA-DQB1 1 HLA-DQB1 2 mismatch HLA-DPA1 1 HLA-DPA1 2 mismatch HLA-DPB1 1 HLA-DPB1 2 mismatch Report Authors
ZAFHAM 13W035395 Zulu Sub-Saharan Africa DRB1*1401 DRB1*1501 DQA1*0102 DQA1*0102 DQB1*0602 DQB1*0602 DPB1*0201 DPB1*040101 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036237 Zulu Sub-Saharan Africa A*0301 A*300101 B*1503 B*4201 Cw*020201 Cw*1701 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036248 Zulu Sub-Saharan Africa A*0205 A*290201 B*1801 B*4403 Cw*0202 Cw*0714 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035619 Zulu Sub-Saharan Africa A*3002 A*6802 B*0702 B*3910 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035667 Zulu Sub-Saharan Africa A*0205 A*290201 B*1401 B*1503 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035411 Zulu Sub-Saharan Africa DRB1*0802 DRB1*1104 DQA1*0102 DQA1*0401 DQB1*0301 DQB1*0602 DPB1*0201 DPB1*040101 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036348 Zulu Sub-Saharan Africa A*0211 A*330301 B*4403 B*520101 Cw*0701 Cw*120203 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036259 Zulu Sub-Saharan Africa A*3402 A*6802 B*1510 B*4403 Cw*0407 Cw*0801 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035437 Zulu Sub-Saharan Africa DRB1*0701 DRB1*0701 DQA1*0201 DQA1*030101 DQB1*020101 DQB1*020101 DPB1*040101 DPB1*1501 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035395 Zulu Sub-Saharan Africa DRB1*1401 DRB1*1501 DQA1*0102 DQA1*0102 DQB1*0602 DQB1*0602 DPB1*0201 DPB1*040101 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036237 Zulu Sub-Saharan Africa A*0301 A*300101 B*1503 B*4201 Cw*020201 Cw*1701 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036248 Zulu Sub-Saharan Africa A*0205 A*290201 B*1801 B*4403 Cw*0202 Cw*0714 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035619 Zulu Sub-Saharan Africa A*3002 A*6802 B*0702 B*3910 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035667 Zulu Sub-Saharan Africa A*0205 A*290201 B*1401 B*1503 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035411 Zulu Sub-Saharan Africa DRB1*0802 DRB1*1104 DQA1*0102 DQA1*0401 DQB1*0301 DQB1*0602 DPB1*0201 DPB1*040101 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036348 Zulu Sub-Saharan Africa A*0211 A*330301 B*4403 B*520101 Cw*0701 Cw*120203 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W036259 Zulu Sub-Saharan Africa A*3402 A*6802 B*1510 B*4403 Cw*0407 Cw*0801 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
ZAFHAM 13W035437 Zulu Sub-Saharan Africa DRB1*0701 DRB1*0701 DQA1*0201 DQA1*030101 DQB1*020101 DQB1*020101 DPB1*040101 DPB1*1501 Zulu from Natal Province, South Africa MG Hammond, D Middleton and D Anley
THERE ARE NO COMMAS IN THAT FILE! *smashingforeheadondesk*
*gnarf* OK, that feels better now. That's what I used:
Source,Subject,Population, Pop. Area,HLA-A 1,HLA-A 2,mismatch,HLA-B 1,HLA-B 2,mismatch,HLA-C 1,HLA-C 2,mismatch,HLA-DRB1 1,HLA-DRB1 2,mismatch,HLA-DQA1 1,HLA-DQA1 2,mismatch,HLA-DQB1 1,HLA-DQB1 2,mismatch,HLA-DPA1 1,HLA-DPA1 2,mismatch,HLA-DPB1 1,HLA-DPB1 2,mismatch,Report,Authors
ZAFHAM,13W035395,Zulu,Sub- Saharan Africa,,,,,,,,,,DRB1*1401, DRB1*1501, ,DQA1*0102 ,DQA1*0102 ,,DQB1*060 2,DQB1*060 2,,,,,DPB1 *0201,DPB1 *040101,," Zulu from Natal Province, South Africa ","MG Hammond, D Middleton and D Anley "
ZAFHAM,13W036237,Zulu,Sub- Saharan Africa,A*0301,A*300101,,B* 1503,B*420 1,,Cw*0202 01,Cw*1701 ,,,,,,,,,, ,,,,,,,"Zu lu from Natal Province, South Africa ","MG Hammond, D Middleton and D Anley "
Source,Subject,Population,
ZAFHAM,13W035395,Zulu,Sub-
ZAFHAM,13W036237,Zulu,Sub-
>>>> THERE ARE NO COMMAS IN THAT FILE
As there could be spaces in column 4 'Population Area', the file couldn't be evaluated properly if there are no commas.
Regards
As there could be spaces in column 4 'Population Area', the file couldn't be evaluated properly if there are no commas.
Regards
ASKER
wait they are commas i opened it in xcel, but its a csv file
>> wait they are commas
The above was your original mhc.csv from http:Q_21457847.html
What you posted now has one comma per line, and that's definitely in the wrong place.
The above was your original mhc.csv from http:Q_21457847.html
What you posted now has one comma per line, and that's definitely in the wrong place.
Oh, and BTW, the reason for the crash is kinda odd - there was an error in 'ExtractField()' - the following will avoid a crash, but without commas in mhc.csv, it cannot work.
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
ASKER
when u open a csv file in excel it takes away all the commas cuz its in cells, however, the actually file if opened in a text editor will show commas
Well, then try the above change, or
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1 8,19, -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos);
cout << " found " << strType << endl;
// getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins) {
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat or im = mapProteins.find(*il);
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
return 0;
}
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos);
cout << " found " << strType << endl;
// getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
map<string,string>::iterat
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
}
}
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout << "Enter Population: " << endl;
cin >> sPop;
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo
return 0;
}
ASKER
ok the program runs all the way through and makes the result file, except that it just says Zulu and thats it
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Did you try the program I posted above? You simply need to create an empty Windows Console application in VC6 and add the file posted to the project tree.
The program ask for the file names and shows a menu where you can make queries. Choose '2' Population Query and type 'Zulu' . The output is written to file 'output.txt' on the current directory.
I wonder if it would output more than 'Zulu'.
Regards, Alex
The program ask for the file names and shows a menu where you can make queries. Choose '2' Population Query and type 'Zulu' . The output is written to file 'output.txt' on the current directory.
I wonder if it would output more than 'Zulu'.
Regards, Alex
When the data simply does not match, I'm sure it won't. The actual test data that was given yields correct results anyway...
>>>>> When the data simply does not match, I'm sure it won't.
Yes, I agree. It seems the spec of file 1 and/or file 2 still doesn't match the files actually used for testing.
Regards
Yes, I agree. It seems the spec of file 1 and/or file 2 still doesn't match the files actually used for testing.
Regards
ASKER
its outputting the same thing on the screen
So, there's no match... one possible outcome of a search.
Try to feed it with test data, e.g.
>HLA:HLA01031 DRB1*1401
12333
>HLA:HLA01032 DRB1*1501
fdfdfd
>HLA:HLA01033 A*300101
asdfdfdf
>HLA:HLA01034 Cw*020201
asfdfdsf
as hla_nuc.fasta.txt and you'll see results.
Try to feed it with test data, e.g.
>HLA:HLA01031 DRB1*1401
12333
>HLA:HLA01032 DRB1*1501
fdfdfd
>HLA:HLA01033 A*300101
asdfdfdf
>HLA:HLA01034 Cw*020201
asfdfdsf
as hla_nuc.fasta.txt and you'll see results.
ASKER
same result, it will say iut searched the appropiate terms, but then in the output fil still just zulu, does it matter if the 2nd file is not a txt file
ASKER
actually it shouldnt because thus it finds the matches in the hla_nuc file. so i have no idea why its not matchin it up
>>does it matter if the 2nd file is not a txt file
What do you mean by "is not a text file"? You are posting it here, so it's text.
BTW, one problem with the transition from format spec
> DRB1*1401
to
>HLA:HLA01031 DRB1*1401
The space was not properly taken care of. With the above test data and
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
cout << " found " << strType << endl;
// getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
the result is
looking up DRB1*1401
found entry: DRB1*1401
looking up DRB1*1501
found entry: DRB1*1501
looking up DQA1*0102
looking up DQA1*0102
looking up DQB1*0602
looking up DQB1*0602
looking up A*0301
looking up A*300101
found entry: A*300101
looking up B*1503
looking up B*4201
looking up Cw*020201
found entry: Cw*020201
looking up Cw*1701
We searched the following entries:
A*300101
Cw*020201
DRB1*1401
DRB1*1501
and
Zulu
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
You should have spotted that, since you said that you are trying to understand code you've been given :o)
What do you mean by "is not a text file"? You are posting it here, so it's text.
BTW, one problem with the transition from format spec
> DRB1*1401
to
>HLA:HLA01031 DRB1*1401
The space was not properly taken care of. With the above test data and
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
cout << " found " << strType << endl;
// getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
cout << " data " << strLine << endl;
strData += strLine;
}
cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
the result is
looking up DRB1*1401
found entry: DRB1*1401
looking up DRB1*1501
found entry: DRB1*1501
looking up DQA1*0102
looking up DQA1*0102
looking up DQB1*0602
looking up DQB1*0602
looking up A*0301
looking up A*300101
found entry: A*300101
looking up B*1503
looking up B*4201
looking up Cw*020201
found entry: Cw*020201
looking up Cw*1701
We searched the following entries:
A*300101
Cw*020201
DRB1*1401
DRB1*1501
and
Zulu
>DRB1*1401
12333
>DRB1*1501
fdfdfd
>A*300101
asdfdfdf
>Cw*020201
asfdfdsf
You should have spotted that, since you said that you are trying to understand code you've been given :o)
ASKER
perfect, one more thing to ask, is there a way to make an option so that the user can specify if he wants to see the doubles or just show matches once
Win which of the files do you expect the 'doubles'?
ASKER
well there are double occurences in the mhc.csv file
ASKER
and also a way to print out all the populations at once with there matches
ASKER
and also a way to print out all the populations at once with their matches in seperate files...so basically the first thing we worked on, but massed produced for all populations without having to specify the populations. i am also working on the switch for it right now
The 'mass produced' version would be to change 'main()' to
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula tion ( "mhc.csv", sPopulation, lstTypes);
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo types ( pszResult, sPop, lstTypes, mapProteins);
}
To remove the duplicates, just add a function like
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat or im;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri ng>::value _type(*il, *il));
}
lstTypes.erase();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir st);
}
and call that when you don't want any duplicates (right after 'ExtractHaplotypesForPopul ation()')
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo
}
To remove the duplicates, just add a function like
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri
}
lstTypes.erase();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir
}
and call that when you don't want any duplicates (right after 'ExtractHaplotypesForPopul
ASKER
it says it takes no parameter when i use the call line RemoveDuplicates(1st Types)
I have mentioned, something similar earlier already, but: could you post the relevant code? If you had already done that, I might have been able to provide the answer to the proble right now :o)
ASKER
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat or im;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri ng>::value _type(*il, *il));
}
lstTypes.erase();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir st);
}
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br azilian"," Brazilian (Af Eu)","Bulgarian","Bunun"," Burait","C ape York","Chaouya","Croatian" ,"Cuban (Af Eu)","Cuban (Eu)","Czech","Doggon","Fi lipino","F inn 90","Georgian","Groote Eylandt","Guarani-Kaiowa", "Guarani-N andewa","H akka","Han -Chinese 149","Han Chinese 572","Irish","Israeli Jews","Ivantan","Kenyan 142","Kenyan Highlander","Kenyan Lowlander","Kimberley","Ko rean 200","Kurdish","Malay","Ma ndenka","M etalsa","M exican","M innan","No rth America (Af)","North America (As)","North America (Eu)","North American (Hi)","Okinawan","Omani"," Paiwan 51","Pazeh","Puyuma 49","Rukai","Ryukuan","Sai siat","Ser i","Shona" ,"Singapor e","Siraya ","Thai"," Thao","Tor oko","Tsou ","Tuva"," Ugandan"," Yami","Yue ndumu","Yu pik","Zamb ian","Zulu ", NULL};
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
getline(cin, sPop, '\n');
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat ions[i])){ bFound = true; break;}
}
}
return sPop;
}
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula tion ( "mhc.csv", sPopulation, lstTypes);
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo types ( pszResult, sPop, lstTypes, mapProteins);
}
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1 8,19, -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
// cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
// cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
// cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
// cout << " found " << strType << endl;
getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
// cout << " data " << strLine << endl;
strData += strLine;
}
// cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins) {
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
cout << " looking up " << *il << endl;
map<string,string>::iterat or im = mapProteins.find(*il);
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
cout << " found entry: " << im->first << endl;
}
}
map<string,string>::iterat or im;
cout << " We searched the following entries: " << endl;
for (im = mapProteins.begin(); mapProteins.end() != im; ++im)
cout << " " << im->first << endl;
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout<<"Available Populations:"<<endl;
cout<<" "<<endl;
cout<<"Algerian 99, American Samoa, AmerIndian, Ami 97"<<endl;
cout<<"Arab Durze, Atayal, Bari, Brazilian"<<endl;
cout<<"Brazilian (Af Eu), Bulgarian, Bunun, Burait"<<endl;
cout<<"Cape York, Chaouya, Croatian, Cuban (Af Eu)"<<endl;
cout<<"Cuban (Eu), Czech, Doggon, Filipino"<<endl;
cout<<"Finn 90, Georgian, Groote Eylandt, Guarani-Kaiowa"<<endl;
cout<<"Guarani-Nandewa, Hakka, Han-Chinese 149 Han Chinese 572"<<endl;
cout<<"Irish, Israeli Jews, Ivantan, Kenyan 142"<<endl;
cout<<"Kenyan Highlander, Kenyan Lowlander, Kimberley, Korean 200"<<endl;
cout<<"Kurdish, Malay, Mandenka, Metalsa"<<endl;
cout<<"Mexican Minnan North America (Af) North America (As)"<<endl;
cout<<"North America (Eu), North American (Hi), Okinawan, Omani"<<endl;
cout<<"Paiwan 51, Pazeh, Puyuma 49, Rukai"<<endl;
cout<<"Ryukuan, Saisiat, Seri, Shona"<<endl;
cout<<"Singapore, Siraya, Thai, Thao"<<endl;
cout<<"Toroko, Tsou, Tuva, Ugandan"<<endl;
cout<<"Yami, Yuendumu, Yupik, Zambian"<<endl;
cout<<"Zulu"<<endl;
int choice;
cout<<"Below are your menu choices:"<<endl;
cout<<"[1] Show individual populations with frequency"<<endl;
cout<<"[2] Show individual populations without the frequency"<<endl;
cout<<"[3] Print out all populations with their alleles"<<endl;
cin>>choice;
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (1stTypes);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
system("cls");
int main();
return 0;
}
}
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri
}
lstTypes.erase();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir
}
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
getline(cin, sPop, '\n');
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat
}
}
return sPop;
}
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo
}
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
// cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
// cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
// cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
// cout << " found " << strType << endl;
getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
// cout << " data " << strLine << endl;
strData += strLine;
}
// cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
cout << " looking up " << *il << endl;
map<string,string>::iterat
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
cout << " found entry: " << im->first << endl;
}
}
map<string,string>::iterat
cout << " We searched the following entries: " << endl;
for (im = mapProteins.begin(); mapProteins.end() != im; ++im)
cout << " " << im->first << endl;
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout<<"Available Populations:"<<endl;
cout<<" "<<endl;
cout<<"Algerian 99, American Samoa, AmerIndian, Ami 97"<<endl;
cout<<"Arab Durze, Atayal, Bari, Brazilian"<<endl;
cout<<"Brazilian (Af Eu), Bulgarian, Bunun, Burait"<<endl;
cout<<"Cape York, Chaouya, Croatian, Cuban (Af Eu)"<<endl;
cout<<"Cuban (Eu), Czech, Doggon, Filipino"<<endl;
cout<<"Finn 90, Georgian, Groote Eylandt, Guarani-Kaiowa"<<endl;
cout<<"Guarani-Nandewa, Hakka, Han-Chinese 149 Han Chinese 572"<<endl;
cout<<"Irish, Israeli Jews, Ivantan, Kenyan 142"<<endl;
cout<<"Kenyan Highlander, Kenyan Lowlander, Kimberley, Korean 200"<<endl;
cout<<"Kurdish, Malay, Mandenka, Metalsa"<<endl;
cout<<"Mexican Minnan North America (Af) North America (As)"<<endl;
cout<<"North America (Eu), North American (Hi), Okinawan, Omani"<<endl;
cout<<"Paiwan 51, Pazeh, Puyuma 49, Rukai"<<endl;
cout<<"Ryukuan, Saisiat, Seri, Shona"<<endl;
cout<<"Singapore, Siraya, Thai, Thao"<<endl;
cout<<"Toroko, Tsou, Tuva, Ugandan"<<endl;
cout<<"Yami, Yuendumu, Yupik, Zambian"<<endl;
cout<<"Zulu"<<endl;
int choice;
cout<<"Below are your menu choices:"<<endl;
cout<<"[1] Show individual populations with frequency"<<endl;
cout<<"[2] Show individual populations without the frequency"<<endl;
cout<<"[3] Print out all populations with their alleles"<<endl;
cin>>choice;
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (1stTypes);
AssociateProteinsWithHaplo
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
system("cls");
int main();
return 0;
}
}
Well, one error on my side,
lstTypes.erase();
should have been
lstTypes.clear();
Then, you were missing the prototypes and there was also one typo ('1stTypes' instead of 'lstTypes'). The following compiles:
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult);
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins);
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins);
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat or im;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri ng>::value _type(*il, *il));
}
lstTypes.clear();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir st);
}
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br azilian"," Brazilian (Af Eu)","Bulgarian","Bunun"," Burait","C ape York","Chaouya","Croatian" ,"Cuban (Af Eu)","Cuban (Eu)","Czech","Doggon","Fi lipino","F inn 90","Georgian","Groote Eylandt","Guarani-Kaiowa", "Guarani-N andewa","H akka","Han -Chinese 149","Han Chinese 572","Irish","Israeli Jews","Ivantan","Kenyan 142","Kenyan Highlander","Kenyan Lowlander","Kimberley","Ko rean 200","Kurdish","Malay","Ma ndenka","M etalsa","M exican","M innan","No rth America (Af)","North America (As)","North America (Eu)","North American (Hi)","Okinawan","Omani"," Paiwan 51","Pazeh","Puyuma 49","Rukai","Ryukuan","Sai siat","Ser i","Shona" ,"Singapor e","Siraya ","Thai"," Thao","Tor oko","Tsou ","Tuva"," Ugandan"," Yami","Yue ndumu","Yu pik","Zamb ian","Zulu ", NULL};
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
getline(cin, sPop, '\n');
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat ions[i])){ bFound = true; break;}
}
}
return sPop;
}
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula tion ( "mhc.csv", sPopulation, lstTypes);
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo types ( pszResult, sPop, lstTypes, mapProteins);
}
void ExtractHaplotypesForPopula tion(const char* pszHaplotypeFileName, const string& sPop, list<string>& lstResult) {
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1 8,19, -1}; // TEST - remove the 1st -1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
// cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
// cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField );
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
// cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
// cout << " found " << strType << endl;
getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
// cout << " data " << strLine << endl;
strData += strLine;
}
// cout << " adding " << strData << endl;;
mapProteins.insert(map<str ing,string >::value_t ype(strTyp e,strData) );
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo types(cons t char* pszFileName, string sPop, list<string>& lstTypes, map<string,string>& mapProteins) {
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
cout << " looking up " << *il << endl;
map<string,string>::iterat or im = mapProteins.find(*il);
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
cout << " found entry: " << im->first << endl;
}
}
map<string,string>::iterat or im;
cout << " We searched the following entries: " << endl;
for (im = mapProteins.begin(); mapProteins.end() != im; ++im)
cout << " " << im->first << endl;
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout<<"Available Populations:"<<endl;
cout<<" "<<endl;
cout<<"Algerian 99, American Samoa, AmerIndian, Ami 97"<<endl;
cout<<"Arab Durze, Atayal, Bari, Brazilian"<<endl;
cout<<"Brazilian (Af Eu), Bulgarian, Bunun, Burait"<<endl;
cout<<"Cape York, Chaouya, Croatian, Cuban (Af Eu)"<<endl;
cout<<"Cuban (Eu), Czech, Doggon, Filipino"<<endl;
cout<<"Finn 90, Georgian, Groote Eylandt, Guarani-Kaiowa"<<endl;
cout<<"Guarani-Nandewa, Hakka, Han-Chinese 149 Han Chinese 572"<<endl;
cout<<"Irish, Israeli Jews, Ivantan, Kenyan 142"<<endl;
cout<<"Kenyan Highlander, Kenyan Lowlander, Kimberley, Korean 200"<<endl;
cout<<"Kurdish, Malay, Mandenka, Metalsa"<<endl;
cout<<"Mexican Minnan North America (Af) North America (As)"<<endl;
cout<<"North America (Eu), North American (Hi), Okinawan, Omani"<<endl;
cout<<"Paiwan 51, Pazeh, Puyuma 49, Rukai"<<endl;
cout<<"Ryukuan, Saisiat, Seri, Shona"<<endl;
cout<<"Singapore, Siraya, Thai, Thao"<<endl;
cout<<"Toroko, Tsou, Tuva, Ugandan"<<endl;
cout<<"Yami, Yuendumu, Yupik, Zambian"<<endl;
cout<<"Zulu"<<endl;
int choice;
cout<<"Below are your menu choices:"<<endl;
cout<<"[1] Show individual populations with frequency"<<endl;
cout<<"[2] Show individual populations without the frequency"<<endl;
cout<<"[3] Print out all populations with their alleles"<<endl;
cin>>choice;
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (lstTypes);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
system("cls");
int main();
return 0;
}
}
lstTypes.erase();
should have been
lstTypes.clear();
Then, you were missing the prototypes and there was also one typo ('1stTypes' instead of 'lstTypes'). The following compiles:
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#include <map>
using namespace std;
void ExtractHaplotypesForPopula
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins);
void AssociateProteinsWithHaplo
string ExtractField ( unsigned int unRow, const string& sLine) {
// find 'unRow' column
string sField = "";
int nPos = 0;
int nCount = 0;
while ( nCount < (unRow - 1)) { // 'unRow row' means 'unRow - 1 commas'
nPos = sLine.find(',',nPos);
if ( -1 == nPos) return sField; // assume 'no match' if there is no such column
++nPos; ++nCount;
}
// find next delimiting comma
if ( nPos >= sLine.size()) return false;
int nEnd = sLine.find(',',nPos);
if ( -1 == nEnd) return false; // assume 'no match' if there is no following comma
sField = sLine.substr(nPos, nEnd - nPos);
return sField;
}
void RemoveDuplicates (list<string>& lstTypes){
map<string,string> tmp;
map<string,string>::iterat
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
tmp.insert(map<string,stri
}
lstTypes.clear();
for (im = tmp.begin(); tmp.end() != im; ++im) lstTypes.push_back(im->fir
}
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
getline(cin, sPop, '\n');
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat
}
}
return sPop;
}
void Process ( string sPopulation, char* pszFastaFile, char* pszResult) {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
ExtractHaplotypesForPopula
ReadProteins ( pszFastaFile, mapProteins);
AssociateProteinsWithHaplo
}
void ExtractHaplotypesForPopula
static int anFields[] = { 3,4,6,7,9,10,12,13,15,16,1
ifstream is (pszHaplotypeFileName);
string sLine;
if (!is.is_open()) {
cout << "Could not open input file " << pszHaplotypeFileName << endl;
return;
}
while (!is.eof()) {
getline(is,sLine);
// cout << "RAW " << sLine << endl;
string sField = ExtractField (3,sLine);
if (sField == sPop) {
for ( int i = 0; anFields[i] > 0; ++i) {
sField = ExtractField (anFields[i] + 2,sLine);
// cout << " found " << sField << endl;
while ( ' ' == sField[0]) sField.erase(0,1); // remove whitespace
if (sField.size()) lstResult.push_back(sField
}
}
}
}
void ReadProteins(const char* pszProteinFileName, map<string,string>& mapProteins) {
ifstream is (pszProteinFileName);
string strLine;
string strData;
string strType;
if (!is.is_open()) {
cout << "Could not open input file" << endl;
return;
}
while (!is.eof()) {
int nPos;
getline(is,strLine);
// cout << "RAW " << strLine << endl;
if ( strLine[0] == '>') {
nPos = strLine.find(' ');
if ( -1 == nPos) continue;
strType = strLine.substr(nPos + 1);
// cout << " found " << strType << endl;
getline(is,strData);
while ( '>' != (char) is.peek() && !is.eof()) {
getline(is,strLine);
// cout << " data " << strLine << endl;
strData += strLine;
}
// cout << " adding " << strData << endl;;
mapProteins.insert(map<str
strData.erase(); // start out with a fresh buffer
}
}
}
void AssociateProteinsWithHaplo
ofstream os (pszFileName);
string strLine;
string strData;
string strType;
if (!os.is_open()) {
cout << "Could not open output file" << endl;
return;
}
os << sPop << endl;
list<string>::iterator il;
for ( il = lstTypes.begin(); il != lstTypes.end(); ++il) {
cout << " looking up " << *il << endl;
map<string,string>::iterat
if ( mapProteins.end() != im) {
os << ">" << im->first << endl << im->second << endl;
cout << " found entry: " << im->first << endl;
}
}
map<string,string>::iterat
cout << " We searched the following entries: " << endl;
for (im = mapProteins.begin(); mapProteins.end() != im; ++im)
cout << " " << im->first << endl;
}
int main () {
string sPop;
list<string> lstTypes;
map<string,string> mapProteins;
cout<<"Available Populations:"<<endl;
cout<<" "<<endl;
cout<<"Algerian 99, American Samoa, AmerIndian, Ami 97"<<endl;
cout<<"Arab Durze, Atayal, Bari, Brazilian"<<endl;
cout<<"Brazilian (Af Eu), Bulgarian, Bunun, Burait"<<endl;
cout<<"Cape York, Chaouya, Croatian, Cuban (Af Eu)"<<endl;
cout<<"Cuban (Eu), Czech, Doggon, Filipino"<<endl;
cout<<"Finn 90, Georgian, Groote Eylandt, Guarani-Kaiowa"<<endl;
cout<<"Guarani-Nandewa, Hakka, Han-Chinese 149 Han Chinese 572"<<endl;
cout<<"Irish, Israeli Jews, Ivantan, Kenyan 142"<<endl;
cout<<"Kenyan Highlander, Kenyan Lowlander, Kimberley, Korean 200"<<endl;
cout<<"Kurdish, Malay, Mandenka, Metalsa"<<endl;
cout<<"Mexican Minnan North America (Af) North America (As)"<<endl;
cout<<"North America (Eu), North American (Hi), Okinawan, Omani"<<endl;
cout<<"Paiwan 51, Pazeh, Puyuma 49, Rukai"<<endl;
cout<<"Ryukuan, Saisiat, Seri, Shona"<<endl;
cout<<"Singapore, Siraya, Thai, Thao"<<endl;
cout<<"Toroko, Tsou, Tuva, Ugandan"<<endl;
cout<<"Yami, Yuendumu, Yupik, Zambian"<<endl;
cout<<"Zulu"<<endl;
int choice;
cout<<"Below are your menu choices:"<<endl;
cout<<"[1] Show individual populations with frequency"<<endl;
cout<<"[2] Show individual populations without the frequency"<<endl;
cout<<"[3] Print out all populations with their alleles"<<endl;
cin>>choice;
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (lstTypes);
AssociateProteinsWithHaplo
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
system("cls");
int main();
return 0;
}
}
ASKER
witht that code i get a weird run with my getpopulation function. it outputs the lines asking for the population twice and i think this is i get no output
So what is it that you expect as opposed to what you get?
HINT: If you merely say "It doesn't work", you are not really giving away a lot of useful information.
HINT: If you merely say "It doesn't work", you are not really giving away a lot of useful information.
ASKER
im expecting that in the result file i should have a fileall the matches appearing once, instead of duplicates like we get now
ASKER
Also with the mass produced version, can each population display its matches in a seperate file with that population named....so zulu would have its matches, finn 90 would have its matches, etc etc
ASKER
sorry i got the code to work for option two, however, it deletes any reoccurences instead of leaving one copy
ASKER
I think wat is wrong wit the second option it just looks for anything that appears twice, and deletes all occurences of it. ANd i cant figure it out enough to get it to only delete reoccurences, not the first copy
ASKER
jkr...i really need ur help...where did u go?
Well, I have some work to do also. What compiler are you using at the moment? (I am asking because the issue "weird run with my getpopulation function seems to be related to a bug in VC6' STL")
ASKER
it is being run in microsofI ma using microsoft visiual studio c++, if you need more information about the compiler just let me know
Tr<
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br azilian"," Brazilian (Af Eu)","Bulgarian","Bunun"," Burait","C ape York","Chaouya","Croatian" ,"Cuban (Af Eu)","Cuban (Eu)","Czech","Doggon","Fi lipino","F inn 90","Georgian","Groote Eylandt","Guarani-Kaiowa", "Guarani-N andewa","H akka","Han -Chinese 149","Han Chinese 572","Irish","Israeli Jews","Ivantan","Kenyan 142","Kenyan Highlander","Kenyan Lowlander","Kimberley","Ko rean 200","Kurdish","Malay","Ma ndenka","M etalsa","M exican","M innan","No rth America (Af)","North America (As)","North America (Eu)","North American (Hi)","Okinawan","Omani"," Paiwan 51","Pazeh","Puyuma 49","Rukai","Ryukuan","Sai siat","Ser i","Shona" ,"Singapor e","Siraya ","Thai"," Thao","Tor oko","Tsou ","Tuva"," Ugandan"," Yami","Yue ndumu","Yu pik","Zamb ian","Zulu ", NULL};
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
cin >> sPop;
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat ions[i])){ bFound = true; break;}
}
}
return sPop;
}
instead, that should also work with VC6. (cross-checked it)
string GetPopulation(string& sPop)
{
char* apszPopulations[] = {"Algerian 99","American Samoa","AmerIndian","Ami 97","Arab Durze","Atayal","Bari","Br
bool bFound =false;
while(!bFound){
cout<<"What is the name of the population you want to search for?"<<endl;
cout<<"The search is case sensitive and you have to press enter twice."<<endl;
cin >> sPop;
for (int i=0; apszPopulations[i] !=NULL; ++i){
if (!sPop.compare(apszPopulat
}
}
return sPop;
}
instead, that should also work with VC6. (cross-checked it)
ASKER
That fixed one problem, however, with option 2 i am still not getin the right output. It wont show the outputs of matches that come up more than once. meaning that your code instead of leaving just one occurence in the output file, deleted all occurences of it
You need to put a 'break;' after each case, e.g.
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (lstTypes);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
//system("cls");
int main(); // <-------------------What's that for?
return 0;
}
switch(choice)
{
case 1:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta", mapProteins);
AssociateProteinsWithHaplo
break; // <-------
}
case 2:
{
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (lstTypes);
AssociateProteinsWithHaplo
break; // <-------
}
case 3:
{
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
//system("cls");
int main(); // <-------------------What's
return 0;
}
ASKER
i had the breaks in already adn the int main is so that the program will restart if the user puts the wrong input in.
do you have any thoughts as far as why the remove duplicates function takes away all the occurences of matches that have doubles?
do you have any thoughts as far as why the remove duplicates function takes away all the occurences of matches that have doubles?
Err, yes, because it is designed that way. That's what you asked for. It does not matter if you remove duplicates before finding the matches or afterwards - it might speed up the search, but the result will be the same.
ASKER
well what im sayin is i would like to have at one cop of the duplicates in the output file
ASKER
for example a sample output of option one is
>blah A*0101
dfdfdsfdsf
>blah A*0101
dfdfdsfdsf
well in option two i dont wnat to delete both occurences of the matches, rather, just make it so that it shows one match so it would be
>blah A*0101
dfdfdsfdsf
>blah A*0101
dfdfdsfdsf
>blah A*0101
dfdfdsfdsf
well in option two i dont wnat to delete both occurences of the matches, rather, just make it so that it shows one match so it would be
>blah A*0101
dfdfdsfdsf
Hm, IMO that's what it does. However, you forgot to read the proteins in 'case 2':
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta", mapProteins); // was missing
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
If you are going to use all that in the same 'main()', be also sure to clear the containers, e.g.
lstTypes.clear();
MapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins); // was missing
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta", mapProteins); // was missing
AssociateProteinsWithHaplo
break; // <-------
If you are going to use all that in the same 'main()', be also sure to clear the containers, e.g.
lstTypes.clear();
MapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins); // was missing
AssociateProteinsWithHaplo
break; // <-------
ASKER
i have that too in my code, I guess when i cut and pasted, I did something wrong
Well, then it should work - at least here it does:
List before: DRB1*1401
List before: DRB1*1501
List before: DQA1*0102
List before: DQA1*0102
List before: DQB1*0602
List before: DQB1*0602
List before: A*0301
List before: A*300101
List before: B*1503
List before: B*4201
List before: Cw*020201
List before: Cw*1701
List after: A*0301
List after: A*300101
List after: B*1503
List after: B*4201
List after: Cw*020201
List after: Cw*1701
List after: DQA1*0102
List after: DQB1*0602
List after: DRB1*1401
List after: DRB1*1501
looking up A*0301
looking up A*300101
looking up B*1503
looking up B*4201
looking up Cw*020201
looking up Cw*1701
looking up DQA1*0102
looking up DQB1*0602
looking up DRB1*1401
looking up DRB1*1501
We searched the following entries
A*010102
A*010103
A*0102
A*0103
A*0104N
A*0106
Are you sure you are using the right file when compiling?
List before: DRB1*1401
List before: DRB1*1501
List before: DQA1*0102
List before: DQA1*0102
List before: DQB1*0602
List before: DQB1*0602
List before: A*0301
List before: A*300101
List before: B*1503
List before: B*4201
List before: Cw*020201
List before: Cw*1701
List after: A*0301
List after: A*300101
List after: B*1503
List after: B*4201
List after: Cw*020201
List after: Cw*1701
List after: DQA1*0102
List after: DQB1*0602
List after: DRB1*1401
List after: DRB1*1501
looking up A*0301
looking up A*300101
looking up B*1503
looking up B*4201
looking up Cw*020201
looking up Cw*1701
looking up DQA1*0102
looking up DQB1*0602
looking up DRB1*1401
looking up DRB1*1501
We searched the following entries
A*010102
A*010103
A*0102
A*0103
A*0104N
A*0106
Are you sure you are using the right file when compiling?
ASKER
I mean its givine me output dont get me wrong, but the peoblem is it takes out all ocurences of anything is a double. Like make a test file with nothin but duplicates and Im sure once you run the program you wil get an empty output file.
Also did you see my notes earlier about the mass produce and how I would like the matches for each population to be printed into a seperate file named appropiately
Also did you see my notes earlier about the mass produce and how I would like the matches for each population to be printed into a seperate file named appropiately
>> but the peoblem is it takes out all ocurences of anything is a double
No. As you can see above:
List before: DQB1*0602
List before: DQB1*0602
vs.
List after: DQB1*0602
>>Also did you see my notes earlier about the mass produce and how I would like the matches for each population to be printed into a seperate file named
>>appropiately
Yes, but I wonder how that is related to the original question, which was "I would like to create a piece of code that would work with the rest of the program I have been developing." Now we're at a stage that to me resembles "I want you to add a couple of more features that also came into my mind"
No. As you can see above:
List before: DQB1*0602
List before: DQB1*0602
vs.
List after: DQB1*0602
>>Also did you see my notes earlier about the mass produce and how I would like the matches for each population to be printed into a seperate file named
>>appropiately
Yes, but I wonder how that is related to the original question, which was "I would like to create a piece of code that would work with the rest of the program I have been developing." Now we're at a stage that to me resembles "I want you to add a couple of more features that also came into my mind"
ASKER
well would you like for me to open a new post so that more points could be awarded?
As soon as we got the basic issue fixed, yes (also because this page becomes a pain to load with ~200kb now). Have you verified that removing the duplicates wors (at least the way you described the functionality)?. Oh, and one basic point before I forget about that, regarding your menu
while (1)
{
switch(choice)
{
case 1:
{
lstTypes.clear();
mapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
}
case 2:
{
lstTypes.clear();
mapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula tion ( "mhc.csv", sPop, lstTypes);
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo types ( "result.txt", sPop, lstTypes, mapProteins);
break; // <-------
}
case 3:
{
}
case 4: // exit loop
{
exit(0);
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
}
}
while (1)
{
switch(choice)
{
case 1:
{
lstTypes.clear();
mapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo
break; // <-------
}
case 2:
{
lstTypes.clear();
mapProteins.clear();
GetPopulation(sPop);
ExtractHaplotypesForPopula
RemoveDuplicates (lstTypes);
ReadProteins ( "hla_nuc.fasta.txt", mapProteins);
AssociateProteinsWithHaplo
break; // <-------
}
case 3:
{
}
case 4: // exit loop
{
exit(0);
}
default:
cout << "Invalid Menu choice. Please make another selection." << endl;
}
}
ASKER
that still does not work....and I relaly dont knwo how to explain it differently than how i have before, maybe its the numbering because the one im looking for is A*0101 and the lowest match it picks up is A*0202? I have no idea
ASKER
owever, it is picking up at least one occurence
Well, it works in terns of eliminating duplicate entries previously extracted from the .csv file. If there are still no results when matching the entries against the other file, removing teh duplicates certainly isn't responsible for that. Compare the results when you run it with and without removing the duplicates.
ASKER
Your right its on niether, so i dont know what to do and I dont know why its not picking it up
ASKER
Ok well I think i figured out what the problem was, it was something on my end. So I will have to take care of that. I will award the points and move on for my next question
Thanx :o)