问题:脚本成功运行,没有任何错误,但是 MySQL 数据库并没有根据脚本的结果更新。
我添加了 db.autocommit(True)
这一行来让每条语句自动提交,但仍然失败。
环境:Python 2.7,MySQL
我也尝试过在每条持有锁的语句执行后手动调用 db.commit()
,但同样失败。
'''
Specifications:
A multi-threaded web spider that:
- Takes a website and a spidering depth as input
- Downloads the HTML files only
- Inserts the HTML into a MySQL database
- Also parses the forms on each page and inserts the form details into the db
'''
import os
import pipes
import sys
import threading

import MySQLdb
import mechanize
lock = threading.Lock()
def Parse_Forms(target,curr,br):
lock.acquire()
br.open(target)
curr.execute("use web;");
response = []
for forms in br.forms():
i= 0
action = forms.action
method = forms.method
d = dict()
d['method'] = method
d['name'] = action
br.select_form(nr=i)
for control in forms.controls:
if control.value == '':
d[control.name] = "NULL"
elif type(control.value) is list:
d[control.name] = control.value[0]
else:
d[control.name] = control.value
for j in d:
if str(j) == 'login' or str(j) == 'name' or str(j) == 'password' or str(j) == 'method': #These are only the valid names that has to be inserted in MYSQL db
query = "INSERT INTO `forms` ("+str(j)+") values (\""+str(d[j])+"\");"
curr.execute(query)
print "Query Executed!"
i=i+1
response.append(br.submit())
lock.release()
def getHTMLfiles(target,curr):
br = mechanize.Browser()
headers = [('User-Agent','Firefoxy'),]
br.addheaders = headers
br.open(target)
for i in range(0,depth):
for link in br.links():
if ".hmtl" in link.url:
print "Downloading File: "+link.url
os.system("wget "+link.url+" -P Files/")
curr.execute("INSERT INTO `pages` (name) values ("+ "\"link.url\");")
if link.url[0] == '/' and not '.' in link.url: #Indicates that file belongs to server not some external link and is a directory
Parse_Forms(target+link.url,curr,br,db)
if __name__ == "__main__":
    # Two tables in the `web` schema: `pages` stores downloaded HTML file
    # URLs, `forms` stores form parameters.
    # NOTE(review): classic MySQLdb documents the keyword as `passwd=`;
    # `password=` only works on newer mysqlclient builds -- confirm against
    # the installed driver version.
    db = MySQLdb.connect(host="localhost",user="****",password="*****",db="web")
    # Commit automatically after every statement so each worker's inserts
    # become visible without explicit db.commit() calls.
    db.autocommit(True)
    curr = db.cursor()
    target = sys.argv[1]        # start URL (argv[1])
    depth = int(sys.argv[2])    # crawl depth; read as a module global by getHTMLfiles
    threads = []
    # NOTE(review): all 10 workers share a single connection/cursor and
    # crawl the same target, duplicating work; MySQLdb connections are not
    # safe to share across threads -- each thread should ideally open its
    # own connection. Confirm before relying on this under load.
    for workers in range(10):
        t = threading.Thread(target = getHTMLfiles,args = (target,curr,))
        t.daemon = True
        t.start()
        threads.append(t)
    # Join so the process outlives the daemon threads until they finish.
    for thread in threads:
        thread.join()
该脚本可以正常运行,但没有按预期更新 MySQL 数据库。
注意:一切看起来都正常,也没有任何 MySQL 错误(我的意思是完全没有报错)。