以下是我正在使用的代码片段,用于解析网页上的数据
link1 = "https://www.codechef.com/status/" + sys.argv[1] + "?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link1)
s = response.read()
soup = BeautifulSoup(s)
l = soup.findAll('tr',{'class' : 'kol'})
以下是存储在变量 link1 中的示例页面的网址:
https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO
现在,问题是变量l总是得到一个空列表,即使我试图找到的HTML标记生成的表中有条目。
请帮我解决这个问题。
修改
完整代码
from BeautifulSoup import BeautifulSoup
import urllib2
import os
import sys
import subprocess
import time
import HTMLParser
import requests
# Read the "Page 1 of N" pager on the first status page to learn how many
# result pages exist, walk every page collecting the submission ids of
# accepted PHP (language id 29) solutions, and dump each one to a .php file
# via parse_data_final.py.  Usage: python <script> <problem_code>
html_parser = HTMLParser.HTMLParser()

# First status page for the problem given on the command line, filtered to
# language 29 (PHP) and status 15 (accepted).
link = "https://www.codechef.com/status/" + sys.argv[1] + "?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"

useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'

# Fetch via requests with a browser-like User-Agent, the same way every
# later page is fetched (the original mixed urllib2 and requests).
soup = BeautifulSoup(requests.get(link, headers={'User-Agent': useragent}).content)

# Page count: accumulate the digits that follow "of" in the pager text.
# Default to a single page -- the original left x unbound when no
# div.pageinfo was found and crashed with a NameError at print.
x = 1
try:
    for info in soup.findAll('div', {'class': 'pageinfo'}):
        str_val = str(info.contents)
        pos = str_val.find('of')
        x = 0
        for ch in str_val[pos + 3:]:
            if ch.isdigit():
                x = x * 10 + int(ch)
except Exception:  # malformed pager markup: fall back to one page
    x = 1
print(x)

lis = list()  # submission ids, oldest first (built with insert(0, ...))
break_loop = 0  # set once a non-PHP row is seen

for i in range(0, x):
    print(i)
    if break_loop == 1:
        break
    if i == 0:
        link1 = link
    else:
        link1 = "https://www.codechef.com/status/" + sys.argv[1] + "?page=" + str(i) + "&sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
    req = requests.get(link1, headers={'User-Agent': useragent})
    soup = BeautifulSoup(req.content)
    # The served markup carries literal escaped quotes in the class
    # attribute, so the value to match is \"kol\", not kol.
    l = soup.findAll('tr', {'class': r'\"kol\"'})
    print(l)
    for val in l:
        # Language column.
        lang_val = val.find('td', {'width': '70'})
        lang = lang_val.renderContents().strip()
        print(lang)
        try:
            # Score column; a row without a score <span> ends this page.
            data = val.find('td', {'width': '51'})
            data_val = data.span.contents
        except Exception:
            break
        if lang != 'PHP':
            # Rows are sorted, so the first non-PHP row means no further
            # PHP submissions exist on any page.
            break_loop = 1
            break
        # For partially scored problems keep only 100-point submissions.
        if len(data_val) > 1 and html_parser.unescape(data_val[2]) != '100':
            continue
        # Submission id: every digit in the row's first cell.
        str_val = str(val.td.contents)
        j = 0
        for ch in str_val:
            if ch.isdigit():
                j = j * 10 + int(ch)
        lis.insert(0, str(j))

if len(lis) > 0:
    out_dir = sys.argv[1] + "_php"
    try:
        os.mkdir(out_dir)
    except OSError:  # directory already exists
        pass
    count = 1
    for data in lis:
        # Invoke the per-submission scraper with an argument list and an
        # explicit stdout redirect instead of a shell string: sys.argv[1]
        # is user-controlled and must never reach a shell.
        out_path = os.path.join(out_dir, sys.argv[1] + "_" + str(count) + ".php")
        with open(out_path, 'w') as out_file:
            subprocess.call(["python", "parse_data_final.py", data], stdout=out_file)
        count += 1
答案 0(得分:0)
您的代码不起作用,是因为您查找的 class(类名)不对,请尝试使用:
l = soup.findAll('tr',{'class' : r'\"kol\"'})
你也可以得到这样的标签:
l = soup.find('table', {'class': 'dataTable'}).tbody
此外,根据您所用的 Python 版本,您或许应该改用 requests 库。下面是一个例子:
import requests
from bs4 import BeautifulSoup

# Example with the requests library: fetch the status page while posing as
# a desktop browser, then take the body of the submissions table.
url = "https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
headers = {'User-Agent': useragent}
resp = requests.get(url, headers=headers)
document = BeautifulSoup(resp.content, "html.parser")
l = document.find('table', {'class': 'dataTable'}).tbody