Advertisement
Advertisement
| 04.10.2008 at 07:47PM PDT, ID: 23314014 |
|
[x]
Attachment Details
|
||
1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43: 44: 45: 46: 47: 48: 49: 50: 51: 52: 53: 54: 55: 56: 57: 58: 59: 60: 61: 62: 63: 64: 65: 66: 67: 68: 69: 70: 71: 72: 73: 74: 75: 76: 77: 78: 79: 80: 81: 82: 83: 84: 85: 86: 87: 88: 89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: |
from sqlite3 import dbapi2 as sqlite
# Path of the CSV report; write_to_csv() opens it in append mode.
CSV_FILE = "C:/his/testdataFromGen8.csv"
# Path of the SQLite database file that start() connects to.
DATABASE_FILE = "C:/his/test.db"
# Lookup table keyed by word + "&" + url, holding the count as a string.
# Filled by make_index_dict() and read by get_amount().
index_dict ={}
def write_to_csv(content_list):
    """Append one comma-separated line to the file named by CSV_FILE.

    content_list is a sequence of strings. They are joined with "," and a
    newline is added at the end. Fields are written verbatim: no CSV quoting
    or escaping is applied, so a field that itself contains a comma will
    spill into the following column.
    """
    # The with-block closes (and therefore flushes) the file even when the
    # write raises, so the handle can no longer leak on error.
    with open(CSV_FILE, "a") as f:
        f.write(",".join(content_list) + "\n")
def get_csv_output(matrix, words, urls):
    """Write the url-by-word count table through write_to_csv().

    The first line written is a header: "N/A" followed by every entry of
    words. After that, one line is written per url: the url itself followed
    by the value get_amount() returns for each word, in the order of words.
    matrix is only forwarded to get_amount(). Progress messages are printed
    to stdout.
    """
    write_to_csv(["N/A"] + list(words))
    for url in urls:
        counts = [get_amount(matrix, word, url) for word in words]
        print("finish make url : " + url)
        write_to_csv([url] + counts)
    print("have done...")
def get_amount(matrix, word, url):
    """Return the count stored for (word, url) as a string, or "0".

    The value is looked up in the module-level index_dict under the key
    word + "&" + url, the same key format make_index_dict() writes.

    matrix is accepted only to keep the signature callers already use; it is
    not consulted. The previous body contained a linear scan over matrix
    after an if/else in which both branches returned, so that scan could
    never run and has been removed.
    """
    return index_dict.get(word + "&" + url, "0")
def make_index_dict(rows):
    """Fill the module-level index_dict from rows.

    Each row is indexed positionally: row[0] is stored (converted with
    str()) under the key row[1] + "&" + row[2]. start() passes rows of the
    form (amount, word, url). A later row with the same key overwrites an
    earlier one. Existing entries in index_dict are kept; nothing is
    cleared. A progress message is printed when done.
    """
    for row in rows:
        index_dict[row[1] + "&" + row[2]] = str(row[0])
    print("finish init index_dict...")
def start():
    """Export per-url word counts from the SQLite database to the CSV file.

    Steps:
      1. Connect to DATABASE_FILE and count wordlocation rows per
         (url, word) pair, joined to wordlist and urllist to obtain the
         word and url text. Each result row is (amount, word, url).
      2. Drop every row whose word is in the ignore list below.
      3. Collect the distinct urls and words in first-seen order.
      4. Build the lookup table with make_index_dict() and write the table
         with get_csv_output().

    If the query returns no rows, "no data found..." is printed and the
    function returns without writing anything. Previously execution fell
    through and a lone "N/A" header line was appended to the CSV.
    """
    sql = '''SELECT count( wordlocation.wordid ) AS amount ,
wordlist.word ,
urllist.url
FROM wordlocation , wordlist , urllist
WHERE wordlist.rowid = wordlocation.wordid
AND urllist.rowid = wordlocation.urlid
GROUP BY urlid , wordid'''
    conn = sqlite.connect(DATABASE_FILE)
    try:
        cu = conn.cursor()
        cu.execute(sql)
        rs = cu.fetchall()
    finally:
        # Everything needed has been fetched into memory; release the
        # database handle even if the query raised.
        conn.close()
    if not rs:
        print("no data found...")
        return
    moreIgnoreWords = ['googl', 'blog', 'search', 't', 'link', 'activ', 'background', 'k', 'h', 'font', 'i', 'z', 'displai', 'none', 'div', 'n', 'margin', 'top', 'bold', 'q', 'b', 'ch', 'cursor', 'pointer', 'e', 'p', 'pr', 'ul', 'li', 'list', 'style', 'j', 'line', 'height', 'br', 'sbb', 'td', 'lrr', 'sop', 'gbar', 'float', 'gbh', 'border', 'solid', 'posit', 'absolut', 'gbi', 'fff', 'index', 'guser', 'import', 'media', 'all', 'right', 'vertic', 'align', 'block', 'text', 'decor', 'hover', 'bodi', 'famili', 'arial', 'san', 'serif', 'm', 'tpb', 'ttb', 'white', 'space', 'nowrap', 'rsb', 'btb', 'bt', 'ln', 'ccc', 'hd', 'eee', 's', 'f', 'fl', 'w', 'green', 'img', 'l', 'g', 'tr', 'asb', 'as', 'window', 'function', 'd', 'c', 'var', 'on', 'if', 'addeventlisten', 'fals', 'els', 'attachev', 'appli', 'thi', 'argument', 'return', 'undefin', 'o', 'firstchild', 'tagnam', 'px', 'tg', 'navextra', 'document', 'getelementbyid', 'getelementsbytagnam', 'span', 'event', 'cancelbubbl', 'createel', 'arrai', 'everi', 'createpopup', 'ifram', 'framebord', 'scroll', 'no', 'src', 'javascript', 'parentnod', 'appendchild', 'id', 'for', 'insertbefor', 'classnam', 'click', 'close', 'while', 'do', 'offsetleft', 'offsetpar', 'ss', 'statu', 'TRUE', 'cs', 'ga', 'substr', 'r', 'target', 'srcelement', 'locat', 'href', 'clk', 'url', 'ct', 'cd', 'cad', 'sg', 'rwurl', 'escap', 'replac', 'rdh', 'rdp', 'length', 'host', 'hostnam', 'pathnam', 'field', 'split', 'break', 'new', 'imag', 'blogsearch', 'sa', 'ei', 'rl', 'nln', 'continu', 'sig', 'colonpo', 'indexof', 'posidarrai', 'pb', 'number', 'nan', 'onmousedown', 're', 'web', 'map', 'shop', 'gmail', 'more', 'video', 'group', 'book', 'scholar', 'financ', 'youtub', 'calendar', 'photo', 'reader', 'even', 'raquo', 'sign', 'nbsp', 'advanc', 'prefer', 'result', 'about', 'second', 'toggletoadvanc', 'noadvanc', 'togglefromadvanc', 'publish', 'last', 'hour', 'dai', 'past', 'week', 'month', 'anytim', 'write', 'choos', 'date', 'hide', 'form', 'action', 'http', 'com',
                       'daterang', 'name', 'onsubmit', 'formatdateinput', 'input', 'as_q', 'type', 'hidden', 'valu', 'num', 'hl', 'en', 'as_epq', 'as_eq', 'lr', 'safe', 'ie', 'iso', 'as_mind', 'as_minm', 'as_mini', 'as_maxd', 'as_maxm', 'as_maxi', 'as_drrb', 'ctz', 'start', 'autocomplet', 'off', 'maxlength', 'onblur', 'checkandclos', 'onfocu', 'open', 'end', 'option', 'btnd', 'submit', 'go', 'gettimezoneoffset', 'montharrai', 'jan', 'feb', 'mar', 'apr', 'mai', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'dowarrai', 'su', 'tu', 'th', 'datedelim', 'ymdpermut', 'mdy', '_calendarpopup', 'setinitialcalendarset', 'class', 'ddcalp_dai', 'curr_dai', 'subscrib', 'alert', 'atom', 'rss', 'sort', 'by', 'relev', 'relat', 'care', 'center', 'inform', 'you', 'can', 'us']
    # Build the set once so each row costs one hash lookup instead of a
    # scan over several hundred list entries.
    ignored_words = set(moreIgnoreWords)
    print("finish load data...")
    rows = [row for row in rs if row[1] not in ignored_words]
    # Distinct urls/words in first-seen order; the sets only speed up the
    # "already seen?" test, the lists define the CSV row/column order.
    urls = []
    words = []
    seen_urls = set()
    seen_words = set()
    for row in rows:
        if row[2] not in seen_urls:
            seen_urls.add(row[2])
            urls.append(row[2])
        if row[1] not in seen_words:
            seen_words.add(row[1])
            words.append(row[1])
    make_index_dict(rows)
    get_csv_output(rows, words, urls)
# Script entry point: run the export only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    start()
|