我想比较表中的各行,看看它们是否相等。我的做法是执行两次查询:1. 从表中选择 visited = 'Ye'(已访问)的链接;2. 从表中选择 visited = 'No'(未访问)的链接。然后使用 for 循环和 if 语句比较已访问的链接与未访问的链接是否相等;如果相等,就把该链接的 visited 更新为 'Ye'。程序还没有完成(我的目的是:当所有链接都已被访问并全部标记为 'Ye',或者 "visited = 'No'" 的查询返回空结果时,退出程序)。我的部分代码:
import sys
import MySQLdb
import urllib
import urlparse
import re
import HTMLParser
from HTMLParser import HTMLParseError
from bs4 import BeautifulSoup
# Connection settings for the local MySQL server that stores the crawl table.
_DB_SETTINGS = {
    'host': 'localhost',
    'user': 'root',
    'passwd': 'shailang',
    'db': 'mydb',
}

# Module-wide connection and cursor shared by the functions below.
mydb = MySQLdb.connect(**_DB_SETTINGS)
cursor = mydb.cursor()
def process2(url):
    """Mark unvisited links as visited when the same link is already visited.

    Reads the links stored in DATA_urls with visited = 'Ye' and those stored
    with visited = 'No'. Every link that appears in both result sets has its
    rows updated to visited = 'Ye'. All updates are committed together in a
    single transaction.

    Args:
        url: Accepted only so existing callers keep working; it is not used.
    """
    cursor.execute("SELECT links FROM DATA_urls where visited = 'Ye'")
    # fetchall() returns one-element row tuples; keep only the link value.
    # NULL links are dropped because "links = NULL" never matches a row.
    visited_links = set(
        row[0] for row in cursor.fetchall() if row[0] is not None
    )

    cursor.execute("SELECT links FROM DATA_urls where visited = 'No'")
    pending_links = set(
        row[0] for row in cursor.fetchall() if row[0] is not None
    )

    # Set intersection replaces the pairwise nested-loop comparison.
    matched_links = pending_links & visited_links
    for link in matched_links:
        print('EQUALS')
        cursor.execute(
            "UPDATE DATA_urls SET visited = 'Ye' where links = %s",
            (link,),
        )

    # Commit once, and only when something actually changed.
    if matched_links:
        mydb.commit()
def process(url):
proxies = {"http":"http://proxy4.nehu.ac.in:3128",
"https":"https://proxy4.nehu.ac.in:3128"}
page = urllib.urlopen(url,proxies=None)
text = page.read()
page.close()
soup = BeautifulSoup(text)
file=open('s.txt','w')
cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,'NULL','Ye')",url)
for tag in soup.findAll('a', href=True):
tag['href'] = urlparse.urljoin(url, tag['href'])
print tag['href']
if re.match(ur'(?i)\b((?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))',tag['href']):
cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,%s,'No')", (tag['href'],url))
file.write('\n')
file.write(tag['href'])
#file.close()
# cursor.execute("SELECT * FROM url")
# rows = cursor.fetchall()
mydb.commit()
process2(1)
def main():
    """Process every URL given on the command line.

    Prints 'No url !!' and exits with status 1 when no URL argument was
    supplied; otherwise calls process() on each argument in order.
    """
    urls = sys.argv[1:]
    if not urls:
        print('No url !!')
        sys.exit(1)
    for url in urls:
        process(url)


# Run only when executed as a script, so importing this module does not
# start crawling or call sys.exit().
if __name__ == '__main__':
    main()
我没有收到任何错误,但数据库中没有任何内容被更新。我的表结构(DESC):
+---------+---------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+---------+---------------+------+-----+---------+-------+
| links | varchar(1000) | YES | | NULL | |
| parent | varchar(1000) | YES | | NULL | |
| visited | varchar(2) | YES | | NULL | |
+---------+---------------+------+-----+---------+-------+
答案 0 :(得分:0)
将其更改为:
mydb = MySQLdb.connect(host='localhost', user='root', passwd='shailang', db='mydb')
cursor = mydb.cursor()
def process2(url):
flag=0
cursor.execute("SELECT links FROM DATA_urls where visited = Ye")
Yes_rows = cursor.fetchall()
cursor.execute("SELECT links FROM DATA_urls where visited = No")
No_rows = cursor.fetchall()
count = len(No_rows)
for i in range(0, count):
print 'NOOOOOOOOOO'
k= No_links
print k
for j in range (i+1, count):
print "YESSSSSSSSSSSSSS"
k1 = Yes_links
print k1
if k1 == k :
print 'EQUALS'
cursor.execute("UPDATE DATA_urls SET visited =
'Ye' where links = %s",k)