I am working on a class that I will be using in a much larger program on linux. I made the class in DevC++ using the MINGW port of the gcc compiler. The class and test program compile and run fine under windows with mingw, but when I comple it with G++ under linux amd run it I get a segfault. I have never debuged a program under linux so I don't really know where the problem is, but I think it has something to do with malloc(),strcpy(),or strcat() working differently in linux. So can anyone debug and fix what is neccassary in the program below and maybe tell me what caused it. The purpos of the class is to parse/extract/manipulate data from html files so the I can insert the data into a database.
#include <iostream.h>
#include <stdlib.h>
#include <alloc.h>
#include <string.h>
class page_type{
char* html;
char* title;
char* body;
char* plain;
void set_title(void);
void set_body(void);
char* strip_tags(char*);
void html_tolower(void);
public:
char* str_replace_once(char*,cha
r*,char*);
char* str_to_lower(char*);
char* trunc(char*,int);
char* remove_symbols(char*);
char* str_replace(char* ,char* , char*);
char* get_title(void);
char* get_body(void);
void init(char*);
};
char* page_type::str_to_lower(ch
ar* string)
{
char* temp;
temp = (char*)malloc(strlen(strin
g)+1);
memcpy(temp,string,strlen(
string));
temp = str_replace("A","a",temp);
temp = str_replace("B","b",temp);
temp = str_replace("C","c",temp);
temp = str_replace("D","d",temp);
temp = str_replace("E","e",temp);
temp = str_replace("F","f",temp);
temp = str_replace("G","g",temp);
temp = str_replace("H","h",temp);
temp = str_replace("I","i",temp);
temp = str_replace("J","j",temp);
temp = str_replace("K","k",temp);
temp = str_replace("L","l",temp);
temp = str_replace("M","m",temp);
temp = str_replace("N","n",temp);
temp = str_replace("O","o",temp);
temp = str_replace("P","p",temp);
temp = str_replace("Q","q",temp);
temp = str_replace("R","r",temp);
temp = str_replace("S","s",temp);
temp = str_replace("T","t",temp);
temp = str_replace("U","u",temp);
temp = str_replace("V","v",temp);
temp = str_replace("W","w",temp);
temp = str_replace("X","x",temp);
temp = str_replace("Y","y",temp);
temp = str_replace("Z","z",temp);
return(temp);
}
char* page_type::trunc(char* data,int len)
{
char* temp;
int size;
size = strlen(data);
if(size > len)
{
temp = (char*)malloc(len+10);
memcpy(temp,data,len);
return(temp);
}
else
{
return(data);
}
}
char* page_type::remove_symbols(
char* original)
{
char* tempstr;
tempstr = (char*)malloc(strlen(origi
nal)+1);
strcpy(tempstr,original);
tempstr = str_replace("!"," ",tempstr);
tempstr = str_replace("@"," ",tempstr);
tempstr = str_replace("#"," ",tempstr);
tempstr = str_replace("$"," ",tempstr);
tempstr = str_replace("%"," ",tempstr);
tempstr = str_replace("^"," ",tempstr);
tempstr = str_replace("&"," ",tempstr);
tempstr = str_replace("*"," ",tempstr);
tempstr = str_replace("("," ",tempstr);
tempstr = str_replace(")"," ",tempstr);
tempstr = str_replace("-"," ",tempstr);
tempstr = str_replace("_"," ",tempstr);
tempstr = str_replace("="," ",tempstr);
tempstr = str_replace("+"," ",tempstr);
tempstr = str_replace("\\"," ",tempstr);
tempstr = str_replace("|"," ",tempstr);
tempstr = str_replace("`"," ",tempstr);
tempstr = str_replace("~"," ",tempstr);
tempstr = str_replace("["," ",tempstr);
tempstr = str_replace("]"," ",tempstr);
tempstr = str_replace("{"," ",tempstr);
tempstr = str_replace("}"," ",tempstr);
tempstr = str_replace(";"," ",tempstr);
tempstr = str_replace(":"," ",tempstr);
tempstr = str_replace("'"," ",tempstr);
tempstr = str_replace(","," ",tempstr);
tempstr = str_replace("."," ",tempstr);
tempstr = str_replace("<"," ",tempstr);
tempstr = str_replace(">"," ",tempstr);
tempstr = str_replace("/"," ",tempstr);
tempstr = str_replace("?"," ",tempstr);
return(tempstr);
}
char* page_type::str_replace(cha
r* find, char* replace, char* string)
{
char* new_string;
char* test_var;
int state = 1;
new_string = (char*) malloc(strlen(string)+1);
strcpy(new_string,string);
while ( state == 1)
{
if(str_replace_once(find,r
eplace,new
_string) != NULL)
{
new_string = str_replace_once(find,repl
ace,new_st
ring);
}
else
{
state = 0;
}
}
return(new_string);
}
char* page_type::str_replace_onc
e(char* find, char* replace, char* string)
{
int find_len;
int replace_len;
int string_len;
int new_string_len;
char* new_string;
char* found_start;
char* found_end;
char* tempstring;
tempstring = (char*)malloc(strlen(strin
g)+1);
strcpy(tempstring,string);
find_len = strlen(find);
replace_len = strlen(replace);
string_len = strlen(tempstring);
new_string_len = ((string_len - find_len) + replace_len);
new_string = (char* ) malloc(new_string_len + 1);
if(strstr(tempstring,find)
!= NULL)
{
found_start = strstr(tempstring,find);
found_end = strstr(tempstring,find) + find_len;
memcpy(new_string,tempstri
ng,string_
len - strlen(found_start));
strcat(new_string,replace)
;
strcat(new_string,found_en
d);
return(new_string);
}
else
{
return(NULL);
}
}
void page_type::html_tolower(vo
id)
{
html = str_to_lower(html);
}
char* page_type::get_title(void)
{
if(title != NULL)
{
return(title);
}
else
{
return(NULL);
}
}
char* page_type::get_body(void)
{
if(body != NULL)
{
return(body);
}
else
{
return(NULL);
}
}
void page_type::set_title(void)
{
char* t_start;
char* t_end;
char* t_temp1;
char* t_temp2;
int size;
int t_size;
size = strlen(html);
t_temp1 = (char *)malloc(size + 2);
if(t_temp1 != NULL)
{
strcpy(t_temp1,html);
t_start = strstr(t_temp1,"<title>")+
7;
t_end = strstr(t_temp1,"</title>")
;
if((t_start != NULL) && (t_end != NULL))
{
t_size = t_end - t_start;
if(t_size > 0)
{
if(t_temp2 = (char*)malloc(t_size + 1))
{
memcpy(t_temp2,t_start,t_s
ize);
t_temp2 = remove_symbols(t_temp2);
if(title = (char*)malloc(strlen(t_tem
p2)+1))
{
strcpy(title,t_temp2);
}
else
{
cout << "Not enough mem for title" << endl;
}
}
else
{
cout << "not enough mem for t_temp2" << endl;
}
}
else
{
cout << "No title data" << endl;
}
}
else
{
cout << "No Title Found!" << endl;
}
}
else
{
cout << "Not enought Mem for t-temp1 in function set_title()" << endl;
}
}
void page_type::set_body(void)
{
char* b_start;
char* b_end;
char* b_temp1;
char* b_temp2;
char* b_temp3;
char* templen;
int size;
int b_size;
size = strlen(html);
b_temp1 = (char *)malloc(size+1);
if(b_temp1 != NULL)
{
strcpy(b_temp1,html);
templen = strstr(b_temp1,"<bod");
b_start = strstr(templen,">")+1;
b_end = strstr(b_temp1,"</bod");
if((b_start != NULL) && (b_end != NULL))
{
b_size = b_end - b_start;
b_temp2 = (char*)malloc(b_size + 1);
if(b_temp2 != NULL)
{
memcpy(b_temp2,b_start,b_s
ize);
b_temp3 = strip_tags(b_temp2);
b_temp3 = remove_symbols(b_temp3);
body = (char*) malloc(strlen(b_temp3));
if(body != NULL)
{
strcpy(body,b_temp3);
}
else
{
cout << "Not enough mem for body" << endl;
}
}
else
{
cout << "not enough room for b_temp2" << endl;
}
}
else
{
cout << "No body data" << endl;
}
}
else
{
cout << "not enough mem for b_temp1" << endl;
}
}
void page_type::init(char* html_data)
{
html = (char *)malloc(strlen(html_data)
+ 2);
if(html != NULL)
{
memcpy(html,html_data,(str
len(html_d
ata)+1));
html_tolower();
set_title();
set_body();
}
}
char* page_type::strip_tags(char
*data)
{
char* temp1;
char* temp2;
char* temp3;
char* start;
char* end;
int dist;
int size;
temp1 = (char*)malloc(strlen(data)
+1);
strcpy(temp1,data);
while((start != NULL) && (end != NULL))
{
size = strlen(temp1);
start = strstr(temp1,"<");
end = strstr(temp1,">")+1;
size = strlen(temp1);
if((start != NULL) && (end != NULL))
{
dist = size - strlen(start);
temp2 = (char *) malloc(dist + 2);
memcpy(temp2,temp1,dist);
temp3 = (char *) malloc(strlen(end)+1);
memcpy(temp3,end,strlen(en
d));
free(temp1);
temp1 = (char *)malloc((strlen(temp2)+(s
trlen(temp
3)+1)));
memcpy(temp1,temp2,strlen(
temp2));
strcat(temp1,temp3);
}
else
{
break;
}
}
return(temp1);
}
int main(int argc, char *argv[])
{
page_type *test;
page_type testx;
test = &testx;
//char* teststr = "Replace Bable fish";
//char* strtest;
test->init("<html><head><T
ITLE>Hello
World In order to test my trunc() function I need to make the title longer then 50 charactures I think I have it there now</TITLE></head><BODY bgcolor=\"red\"><b>This isn't a page;</b> about the \"hello world\" program! Instead it is a page that I am using o test the functionality of my new class wich I am going to use for the new Delta controls search endgine spider.</BODY></html>");
cout << test->trunc(test->get_titl
e(),50) << endl << endl;
cout << test->trunc(test->get_body
(),200) << endl << endl;
//cout << teststr << endl;
//strtest = test->str_replace_once("Ba
ble", "Gold", teststr);
//cout << strtest << endl;
system("PAUSE");
return 0;
}