我试图从中国住房网站上搜集一些住房信息。我运行时代码没有错误。但是,运行过程完成时没有输出文件。
import requests
from bs4 import BeautifulSoup
import sys
import os
import time
import pandas as pd
import numpy as np
from parsel import Selector
import re
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}
def catchHouseList(url):
resp = requests.get(url, headers=headers, stream=True)
if resp.status_code == 200:
reg = re.compile('<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')
urls = re.findall(reg, resp.text)
return urls
return []
def catchHouseDetail(url):
resp = requests.get(url, headers=headers)
print(url)
if resp.status_code == 200:
info = {}
soup = BeautifulSoup(resp.text, 'html.parser')
info['Title'] = soup.select('.main')[0].text
info['Total_Price'] = soup.select('.total')[0].text
info['Unit_Price'] = soup.select('.unit')[0].text
info['Price_per_square'] = soup.select('.unitPriceValue')[0].text
# p = soup.select('.tax')
# info['Reference_price'] = soup.select('.tax')[0].text
info['Built_time'] = soup.select('.subInfo')[2].text
info['Place_Name'] = soup.select('.info')[0].text
info['Area'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
info['Lianjia_number'] = str(url)[34:].rsplit('.html')[0]
info['flooring_plan'] = str(soup.select('.content')[2].select('.label')[0].next_sibling)
info['floor'] = soup.select('.content')[2].select('.label')[1].next_sibling
info['Area_Size'] = soup.select('.content')[2].select('.label')[2].next_sibling
info['Flooring_structure'] = soup.select('.content')[2].select('.label')[3].next_sibling
info['Inner_Area'] = soup.select('.content')[2].select('.label')[4].next_sibling
info['Building_Category'] = soup.select('.content')[2].select('.label')[5].next_sibling
info['House_Direction'] = soup.select('.content')[2].select('.label')[6].next_sibling
info['Building_Structure'] = soup.select('.content')[2].select('.label')[7].next_sibling
info['Decoration'] = soup.select('.content')[2].select('.label')[8].next_sibling
info['Stair_Number'] = soup.select('.content')[2].select('.label')[9].next_sibling
info['Heating'] = soup.select('.content')[2].select('.label')[10].next_sibling
info['Elevator'] = soup.select('.content')[2].select('.label')[11].next_sibling
# info['Aseest_Year'] = str(soup.select('.content')[2].select('.label')[12].next_sibling)
return info
pass
def appendToXlsx(info):
fileName = './second_hand_houses.xlsx'
dfNew = pd.DataFrame([info])
if (os.path.exists(fileName)):
sheet = pd.read_excel(fileName)
dfOld = pd.DataFrame(sheet)
df = pd.concat([dfOld, dfNew])
df.to_excel(fileName)
else:
dfNew.to_excel(fileName)
def catch():
pages = ['https://zs.lianjia.com/ershoufang/guzhenzhen/pg{}/'.format(x) for x in range(1, 21)]
for page in pages:
print(page)
houseListURLs = catchHouseList(page)
for houseDetailUrl in houseListURLs:
try:
info = catchHouseDetail(houseDetailUrl)
appendToXlsx(info)
except:
pass
time.sleep(2)
pass
if __name__ == '__main__':
catch()
我期望有一个excel输出,但是最终没有任何结果。只告诉我该流程以退出代码0结尾。
答案 0 :(得分:0)
这是您的问题区域之一,只需重新编写一下即可帮助您查看。当状态码不是200时,您将返回一个空列表,没有任何警告或说明。脚本的其余部分需要一个列表才能继续运行。返回空列表时,它会干净地退出。
现在,当您运行代码时,当服务器响应不是200时,此函数将返回None,然后在catch()函数中引发TypeError,这将需要进一步的错误处理。
def catchHouseList(url):
try:
resp = requests.get(url, headers=headers, stream=True)
if resp.status_code == 200:
reg = re.compile(
'<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')
urls = re.findall(reg, resp.text)
return urls
else:
print('catchHouseList response code:', resp.status_code)
except Exception as e:
print('catchHouseList:', e)