我已经在好几个地方看到过类似的问题,但仍然无法把那些做法用到下面的脚本中,让它逐页解析直到最后一页:
我的脚本应该放在页面循环里面,但每当我把它放进去时都会收到缩进错误。这是否意味着我需要缩进整个脚本?还是说这个循环并不适用于我的脚本?
from bs4 import BeautifulSoup
import requests
# Fetch the lot_list pages of one sale, collect every lot row from every
# fetched page, and write the result to asdf.html as a single HTML table.
page = 1
# Hard page limit kept from the original script.  The original author also
# noted `soup.find('u') is None` as a possible "last page" test.
LAST_PAGE = 3
urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4808&siteid=1&h=0&pageno={page}"
# Example of a fully expanded URL:
# "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4740&siteid=1&h=0&pageno=14"

# User-Agent header sent with every request.  Adjacent string literals are
# used instead of a backslash continuation, which silently dropped the space
# between "x64)" and "AppleWebKit".
mozila_agent = (
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
)
headers = {'User-Agent': mozila_agent}

# One entry per lot, accumulated across all pages:
# [lot number, picture URL, description, current bid, time left]
datalist = []
with requests.Session() as session:
    while True:
        response = session.get(urldes.format(page=page), headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        the_whole_table = soup.find('table', width='97%')
        if the_whole_table is None:
            # No lot table on this page: nothing more to collect.
            break

        # Start from the second row ([1:]): the first one is the header
        # (Lot no, Picture, Lot Title...).
        for tr in the_whole_table.find_all('tr')[1:]:
            index_num = tr.find('td', width='8%')
            picture_link = index_num.next_sibling.a['data-img']
            text_info = tr.find('td', width='41%')
            current_bid = tr.find('td', width='13%')
            time_left = tr.find('td', width='19%')
            datalist.append([index_num.text, picture_link,
                             text_info.text, current_bid.text, time_left.text])

        # Compare by value: `page is 3` tests object identity, which is not
        # a reliable way to compare integers.
        if page == LAST_PAGE:
            break  # last page
        page += 1

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']
theads = BeautifulSoup('<table style="width:50%; color: blue; font-family: verdana; font-size: 60%;"></table>', 'lxml')
thekeys = BeautifulSoup('<thead style="color: blue; font-family: verdana; font-size: 60%;"></thead>', 'html.parser')

# Create the table header: one <th> per column title.
for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)
theads.table.append(thekeys)

# Populate the table body: one <tr> per collected lot, one <td> per value.
for i in datalist:
    # Build a real <tr> tag and append the cells to it.  Appending to a
    # BeautifulSoup('<tr></tr>') document would place the cells next to an
    # empty <tr> instead of inside it.
    thedata = theads.new_tag('tr')
    for j in i:
        td_tag = theads.new_tag('td')
        if j.startswith('https'):
            # Picture URLs are rendered as images rather than as text.
            td_tag.append(theads.new_tag('img', src=j, width='300'))
        else:
            td_tag.append(j)
        thedata.append(td_tag)
    theads.table.append(thedata)

css = "<style>{color: blue; font-family: verdana; font-size: 50%;}</style>"
with open('asdf.html', 'w') as f:
    f.write(theads.prettify())
# NOTE: the css string is only printed; it is not written into asdf.html.
print(css)
关于 template.html 和 css:我明白需要用抓取到的数据来填充模板。但是,如果我想单独处理每个拍卖(`auction`)中 5 个值(`value`)里的每一个,我似乎找不到分别访问各个元素的方法。当前代码会依次遍历每个值,而我需要给每个值设置不同的 class,因此必须能够区分这 5 个元素,但我不知道该怎么做。
# Wrap every auction in a <div class="auction"> and every value of that
# auction in an <h4 class="title">.  `auctions` and `soup` are defined
# outside this fragment.
for auction in auctions:
    # "class" is a reserved word in Python, so it cannot be written as a plain
    # keyword argument; new_tag() turns any other keyword into an attribute of
    # that exact name, so `_class=` would produce a literal "_class"
    # attribute.  Unpacking a dict sets the real "class" attribute.
    div_a = soup.new_tag("div", **{'class': 'auction'})
    soup.append(div_a)
    for description in auction:
        div_d = soup.new_tag("h4", **{'class': 'title'})
        div_d.append(description)
        div_a.append(div_d)
正如你在这里看到的,我可以附加标签但重复相同的值而不是循环遍历每个值。
答案 0 :(得分:1)
您可以把脚本放进页面循环中,在每次请求之间构建 HTML;也可以先获取所有拍卖,把它们存储在某种数据结构(例如列表)中,然后遍历该列表,把各行追加到 HTML 中。我采用了第二种方法,因为它更接近您已有的代码。我稍微重构了一下代码,并添加了:
# Leave the enclosing loop (not shown in this fragment) when the parsed page
# contains no <a> element whose string is 'Next'.
if not soup.find_all('a', string='Next'):
    break
当页面中找不到文本为 `Next` 的 `a` 标记时,它会中断循环,这意味着当前页是最后一页。我不喜欢的是用代码来创建表头并插入样式。如果我是你,我会创建一个包含所有样式、页面标题和表格的 HTML“模板”,并从文件中读取它。然后,您可以把行追加到表的 `tbody` 中。您还可以加入指向各个拍卖的链接,而不只是纯文本。
以下代码可用,并在上次检查时生成一个包含1300多个竞价的HTML:
import itertools
from collections import namedtuple
import requests
from bs4 import BeautifulSoup
# Scrape every page of the sale's lot list into `auctions`, then render the
# collected auctions as an HTML table written to auctions.html.
url = 'https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4808&siteid=1&h=0&pageno={}'
auctions = []
Auction = namedtuple('auction',
                     ['index', 'picture_link', 'description', 'current_bid', 'time_left'])

for page in itertools.count(start=1):
    response = requests.get(url.format(page))
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', width='97%')
    if table is None:
        # No lot table on this page: stop instead of failing with an
        # AttributeError on None.
        break
    for tr in table.find_all('tr')[1:]:  # skip the table header
        tds = tr.contents
        index = tds[0].find('h5').text
        # The html of this img tag is broken, so a new one is created below.
        pic_elem = tds[1].find('img')
        picture = soup.new_tag('img', src=pic_elem['src'], width=pic_elem['width'])
        description = tds[2].find('h5').text
        current_bid = tds[3].find('h5').text
        time_left = tds[4].find('h5').text
        auction = Auction(index, picture, description, current_bid, time_left)
        auctions.append(auction)
    # A page without a "Next" link is the last page.
    if not soup.find_all('a', string='Next'):
        break

headers = ['Index Number', 'Picture', 'Informational text', 'Current BID', 'Time Left now']
soup = BeautifulSoup(
'''
<table>
<thead>
<tr>
</tr>
</thead>
<tbody></tbody>
</table>
''', 'lxml')

# These <th> cells could also be written directly into the template above,
# since they are not dynamic.
for header in headers:
    th = soup.new_tag('th')
    th.append(header)
    soup.table.thead.tr.append(th)

# One <tr> per auction, one <td> per field; the picture field is an <img> tag,
# the other fields are plain text.
for auction in auctions:
    tr = soup.new_tag('tr')
    for value in auction:
        td = soup.new_tag('td')
        td.append(value)
        tr.append(td)
    soup.table.tbody.append(tr)

# This <head> can also be included in the initial html; there is no need to
# build it programmatically.
head = soup.new_tag('head')
head.append(soup.new_tag('meta', charset='utf-8'))
head.append(soup.new_tag('style', type='text/css'))
head.style.append(
'''
body {
font-family: verdana;
color: blue;
}
table {
width: 70%;
margin: auto;
font-size: 0.8em;
}
''')
soup.html.insert(0, head)

# The document declares <meta charset="utf-8">, so the file is written as
# UTF-8 explicitly rather than in the platform's default encoding.
with open('auctions.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())
答案 1 :(得分:0)
所以我设法用下面的代码解决了问题的第二部分。我没有尝试修改“拍卖”对象中已有的元素,而是直接访问了源代码并在提取时添加了标记。
我遇到的另一个问题是给 div 标签添加 class:因为 class 是 Python 的保留字,不能直接作为关键字参数传入,所以需要用 ** 解包一个字典来设置该属性,例如 soup.new_tag("div", **{'class':'auction'})
import itertools
from collections import namedtuple
import requests
from bs4 import BeautifulSoup
# Scrape every page of one sale's lot list, wrap each scraped value in a tag
# carrying its own CSS class, and write the result to auctions.html as a grid
# of <div class="auction"> cards.
saleid = '4811'  # another sale id used earlier: '4793'
url = 'https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=' + saleid + '&siteid=1&h=0&pageno={}'
auctions = []
Auction = namedtuple('auction',
                     ['index', 'picture_link', 'description', 'current_bid', 'time_left'])

for page in itertools.count(start=1):
    response = requests.get(url.format(page))
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', width='97%')
    if table is None:
        # No lot table on this page: stop instead of failing with an
        # AttributeError on None.
        break
    for tr in table.find_all('tr')[1:]:  # skip the table header
        tds = tr.contents

        # "class" is a reserved word, so it is passed to new_tag() through
        # an unpacked dict.
        index = tds[0].find('h5').text
        index_tag = soup.new_tag("h5", **{'class': 'index'})
        index_tag.append(index)

        # The html of this img tag is broken, so a new one is created below
        # with a fixed width of 160.
        pic_elem = tds[1].find('img')
        picture = soup.new_tag('img', **{'class': 'image'}, src=pic_elem['src'], width="160")

        description = tds[2].find('h5').text
        description_tag = soup.new_tag("h4", **{'class': 'title'})
        description_tag.append(description)

        current_bid = tds[3].find('h5').text
        current_bid_tag = soup.new_tag("h4", **{'class': 'price'})
        current_bid_tag.append(current_bid)

        time_left = tds[4].find('h5').text
        time_left_tag = soup.new_tag("h5", **{'class': 'time'})
        time_left_tag.append(time_left)

        auction = Auction(index_tag, picture, description_tag, current_bid_tag, time_left_tag)
        auctions.append(auction)
    # A page without a "Next" link is the last page.
    if not soup.find_all('a', string='Next'):
        break

soup = BeautifulSoup(
'''
<div class="container">
</div>
''', 'lxml')

# One <div class="auction"> per auction; its five fields are already tags
# with their own classes, so they are appended as they are.
for auction in auctions:
    div_a = soup.new_tag("div", **{'class': 'auction'})
    soup.div.append(div_a)
    for value in auction:
        div_a.append(value)

# This <head> can also be included in the initial html; there is no need to
# build it programmatically.
# NOTE(review): `margin: left` in the .image rule below does not look like a
# valid CSS margin value; it is kept unchanged here - confirm the intent.
head = soup.new_tag('head')
head.append(soup.new_tag('meta', charset='utf-8'))
head.append(soup.new_tag('style', type='text/css'))
head.style.append(
'''
* {
margin: 0;
}
.container {
font-family: "Arial";
padding: 5px;
display: grid;
justify-items: center;
grid-gap: 5px;
grid-template-columns: repeat(5, 1fr);
text-transform: capitalize;
}
.auction {
display: grid;
grid-template-columns: 140px auto;
grid-template-areas:
"title title time"
"image image image"
"image image image"
"image image image"
"price price index";
width: 300px;
height: 300px;
border: 2px black solid;
font-size: 12px;
}
.image {
grid-area: image;
margin: left
}
.title {
grid-area: title;
text-transform: lowercase;
}
.price {
grid-area: price;
}
.time {
grid-area: time;
}
.index {
grid-area: index;
}
.title, .price, .time, .index {
padding: 10px;
}
''')
soup.html.insert(0, head)

# The document declares <meta charset="utf-8">, so the file is written as
# UTF-8 explicitly rather than in the platform's default encoding.
with open('auctions.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())