我练习刮一个网站。 我有一些神秘的情况。
import requests
from bs4 import BeautifulSoup
import json
class n_auction(object):
def __init__(self):
self.search_request = {
'lawsup':0,
'lesson':0,
'next_biddate1':'',
'next_biddate2':'',
'state':91,
'b_count1':0,
'b_count2':0,
'b_area1':'',
'b_area2':'',
'special':0,
'e_area1':'',
'e_area2':'',
'si':11,
'gu':0,
'dong':0,
'apt_no':0,
'order':'',
'start':60,
'total_record_val':850,
'detail_search':'',
'detail_class':'',
'recieveCode':'',}
self.headers = {'User-Agent':'Mozilla/5.0',
'Referer':'http://goodauction.land.naver.com/auction/ca_list.php'}
def scrape(self, max_pages):
addr = []
pageno = 0
self.search_request['start'] = pageno
while pageno < max_pages:
payload = json.dumps(self.search_request)
r = requests.post('http://goodauction.land.naver.com/auction/ax_list.php', data=payload ,headers=self.headers)
print(r.text)
s = BeautifulSoup(r.text)
print(s)
if __name__ == '__main__':
scraper = n_auction()
scraper.scrape(30)
但经过beautifulsoup, 我失去了一些价值,如下图。
这很令人尴尬。帮帮我~~
答案 0 :(得分:1)
将解析器从默认值lxml
切换到html.parser
为我工作。
尝试:s = BeautifulSoup(r.text, 'html.parser')