我最近在学习 Python 网页抓取,正在用 http://search.proquest.com/ 做练习。这个网站需要付费订阅,不过我想大多数大学都已经购买了访问权限。
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import urllib.error
import http.cookiejar
import requests
import pymysql
# Open the MySQL connection and the shared cursor that store() writes through.
# NOTE(review): host, user and password are hard-coded here — consider moving
# them to configuration before sharing this script.
conn = pymysql.connect(
    host='localhost',
    user='root',
    passwd='gaojia',
    db='mysql',
)
cur = conn.cursor()

# One-time schema setup, kept for reference; switch to the wsj database.
# cur.execute("CREATE DATABASE wsj")
cur.execute("USE wsj")
# cur.execute("CREATE TABLE wsj.list (id INT NOT NULL AUTO_INCREMENT, url VARCHAR(255) NOT NULL, title VARCHAR(1000) , abstract VARCHAR(20000),created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (id))")

issues = set()  # URLs of the desired issues
articles = set()  # URLs of the articles of a given issue
def store(title, abstract, url):
    """Insert one (title, abstract, url) row into ``pages`` and commit it.

    The three values are handed to ``cur.execute`` as query parameters
    instead of being formatted into the SQL text.

    NOTE(review): the commented-out CREATE TABLE statement in this file
    creates ``wsj.list``, not ``pages`` — confirm which table is intended.
    """
    insert_sql = "INSERT INTO pages (title,abstract,url) VALUES (%s,%s,%s)"
    row = (title, abstract, url)
    cur.execute(insert_sql, row)
    # Commit through the cursor's own connection.
    cur.connection.commit()
# Point to the monthly URL, POST the drill-down form, and print the text of
# the target element from the response.
try:
    session = requests.Session()
    url = 'http://search.proquest.com/publication.publicationissuebrowse:drilldown/month/%E5%85%AB%E6%9C%88/08/year/2016/parentmonth082016'
    payload = {"site": "news", "t:ac": "publications_105983"}
    # The hard-coded 'Content-Length': '0' header was dropped: it contradicts
    # the non-empty form body, and requests derives the length from the body.
    # NOTE(review): the Cookie below is a captured browser session and the
    # request is flagged as XMLHttpRequest, so the reply may not be a plain
    # HTML page (e.g. a JSON envelope) — inspect req.headers/req.text to confirm.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/javascript, text/html,application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'search.proquest.com',
        'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Connection': 'keep-alive',
        'Origin': 'http://search.proquest.com',
        'Referer': 'http://search.proquest.com/news/publication/105983/citation/99D2C84D41804033PQ/2?accountid=13818',
        'X-Prototype-Version': '1.7',
        'X-Requested-With': 'XMLHttpRequest',
        # The cookie is one logical string; it is split into adjacent literals
        # (joined by the parser) because a quoted string cannot span lines.
        'Cookie': (
            'availability-zone=us-east-1c; mwtbid=830706AE-9389-4BB4-812D-B597683B812E; _ga=GA1.2.1201070524.1446763952; fsr.r=%7B%22d%22%3A90%2C%22i%22%3A%22de07553-78769885-bcc1-4823-67c96%22%2C%22e%22%3A1467984529571%7D; fulltextShowAll=YES; JSESSIONID=752EDA8BA4C3B6791CDE9ABFEB2BB3A1.i-c1aa5150; authenticatedBy=IP; OS_VWO_COUNTRY=CN; OS_VWO_INSTITUTION=13818; OS_VWO_LANGUAGE=zho; OS_VWO_MY_RESEARCH=false; OS_VWO_REFERRING_URL="http://ourex.lib.sjtu.edu.cn/primo_library/libweb/action/display.do; OS_VWO_REQUESTED_URL="http://search.proquest.com/"; OS_PERSISTENT="wrPZtfJDrH0WIWT5cZZs+CwLAAUhJMHD++Vls3rVx5E="; OS_VWO_VISITOR_TYPE=returning; AWSELB=C393A78D02CA3EE2799CF8894B23627240E8CACE66375056E6D341D7DA668019371E729BF574DF4C7B461B13FCAAE8A127CA655E3AFFDA10D2742B23FD55F3B713F0A97E539C751AC7BD616C8D55DEF2CCCF1762B2; oneSearchTZ=480; AppVersion=r20161.6.0.834.574; availability-zone=us-east-1c; _vwo_uuid_v2=0308785C38305F47209E7EC8811AC0A2|3ec2dd2ac5e7bfcc195a554e24406f22; osTimestamp=1472007458.508; WT_FPC=id=202.120.14.195-2899434048.30480412:lv=1471960659244:ss=1471959777563; '
            'fsr.s=%7B%22cp%22%3A%7B%22Usage_Session%22%3A%2220160824004842532%3A222017%22%2C%22cxreplayaws%22%3A%22true%22%2C%22Error_Page%22%3A%22no%22%2C%22No_Results%22%3A%22no%22%2C%22My_Research%22%3A%22no%22%2C%22Advanced%22%3A%22no%22%2C%22Professional%22%3A%22no%22%2C%22User_IP%22%3A%22202.120.19.182%22%2C%22Session_ID%22%3A%22752EDA8BA4C3B6791CDE9ABFEB2BB3A1.i-c1aa5150%22%2C%22Account_ID%22%3A%2213818%22%7D%2C%22v1%22%3A-2%2C%22v2%22%3A-2%2C%22rid%22%3A%22de07553-78727480-20ae-90fb-1186c%22%2C%22ru%22%3A%22http%3A%2F%2Fourex.lib.sjtu.edu.cn%2Fprimo_library%2Flibweb%2Faction%2Fdisplay.do%3Bjsessionid%3D81B5C8F2B4E21E549ADB7E9BAC4C3C04%3Ftabs%3DdetailsTab%26ct%3Ddisplay%26fn%3Dsearch%26doc%3Dsjtulibxw000061822%26indx%3D1%26recIds%3Dsjtulibxw000061822%26recIdxs%3D0%26elementId%3D0%26renderMode%3DpoppedOut%26displayMode%3Dfull%26frbrVersion%3D%26dscnt%3D0%26scp.scps%3Dscope%253A%2528SJT%2529%252Cscope%253A%2528sjtu_metadata%2529%252Cscope%253A%2528sjtu_sfx%2529%252Cscope%253A%2528sjtulibzw%2529%252Cscope%253A%2528sjtulibxw%2529%252CDuxiuBook%26tab%3Ddefault_tab%26dstmp%3D1471999665891%26vl(freeText0)%3Dproquest%26vid%3Dchinese%22%2C%22r%22%3A%22ourex.lib.sjtu.edu.cn%22%2C%22st%22%3A%22%22%2C%22to%22%3A5%2C%22pv%22%3A26%2C%22lc%22%3A%7B%22d0%22%3A%7B%22v%22%3A26%2C%22s%22%3Atrue%7D%7D%2C%22cd%22%3A0%2C%22f%22%3A1472007439637%2C%22pn%22%3A0%2C%22sd%22%3A0%7D; _ga=GA1.3.1201070524.1446763952; _gat_UA-61126923-3=1'
        ),
    }
    req = session.post(url, data=payload, headers=headers)
    # requests does not raise on 4xx/5xx by itself; without this call the
    # except clause below would never see an HTTP error.
    req.raise_for_status()
    bs0bj = BeautifulSoup(req.text, "html.parser")
    # NOTE(review): the original passed the undefined name `parentId` to
    # find(), which raises NameError. Looking the element up by id is an
    # assumption about the intended selector — confirm against the page.
    target = bs0bj.find(id="parentId")
    if target is None:
        # find() returns None when nothing matches; report it instead of
        # letting .get_text() fail on None.
        print("attributes missing")
    else:
        print(target.get_text())
except requests.exceptions.RequestException as reason:
    # requests raises its own exception hierarchy (HTTPError, ConnectionError,
    # Timeout, ...), never urllib.error.HTTPError.
    print(reason)
运行这段代码后,如果我使用 bs0bj.find("某些").get_text(),它会返回 None,但 "bs0bj" 中确实有很多内容。这里哪里不对?