I'm trying to figure out the simplest way to record the content of webpages that are linked from webpages that are themselves linked from an original webpage. I want my output to be a table whose rows correspond to the content of the third layer of pages deep.
As you can see from the code, I can currently only get the first instance of the item I want on each third-level page. Also, while my current code returns one row for each h2 item on the base URL, I'd like multiple rows per h2 item (as many as there are instances of "span.'case-doc-details' a" on the second layer).
Some other information: at each linking stage, I don't know how many pages will be linked. I'm using Python and ScraperWiki and am new to both. I've tried to research this question, but hit a wall in knowing what to ask. Thanks in advance for your help.
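In outline, the traversal I'm after looks like this (a sketch of the intent only, using the same libraries and selectors as my code below, with the case-doc-details class written in plain CSS form; the row-saving step is elided):

import urllib
import lxml.html

def fetch(url):
    # Fetch and parse one page (the same urlopen/fromstring pattern as below).
    return lxml.html.fromstring(urllib.urlopen(url).read())

base = fetch('http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All')
for case_link in base.cssselect("h2 a"):                             # level 1 -> level 2
    case_page = fetch('http://www.italaw.com/' + case_link.attrib.get('href'))
    for doc_link in case_page.cssselect("span.case-doc-details a"):  # level 2 -> level 3
        doc_page = fetch('http://www.italaw.com/' + doc_link.attrib.get('href'))
        # ...one output row per doc_page should be produced here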
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count'] = counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title = caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids = caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids) <= 2:
                    record['Rules'] = "None"
                    record['Treaty'] = "None"
                else:
                    record['Rules'] = ids[2].text_content()
                    record['Treaty'] = ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[0].attrib.get('href')).read()
            caseroots2 = lxml.html.fromstring(caselinkurl2)
            # create another table element with rows, marked off with the case that they came from; create all the rows.
            for i in pars:
                if len(pars) == 0:
                    record['DetailsURL'] = "None"
                else:
                    record['DetailsURL'] = pars[0].attrib.get('href')
                pars2 = caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2) == 0:
                    record['Doc Date'] = "None"
                else:
                    record['Doc Date'] = pars2[0].text_content()
                pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) == 0:
                    record['Doc Type Link'] = "None"
                    record['Doc Type'] = "None"
                else:
                    record['Doc Type Link'] = pars3[0].attrib.get('href')
                    record['Doc Type'] = pars3[0].text_content()
                pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4) == 0:
                    record['Claimant Nominee'] = "None"
                else:
                    record['Claimant Nominee'] = pars4[0].text_content()
                pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5) == 0:
                    record['Respondent Nominee'] = "None"
                else:
                    record['Respondent Nominee'] = pars5[0].text_content()
                pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6) == 0:
                    record['President'] = "None"
                else:
                    record['President'] = pars6[0].text_content()
            print record, '------------'
            scraperwiki.sqlite.save(['Count'], record)
            counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

# START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Answer 0 (score: 0)
Here's the code I've got so far. It doesn't grab the document link data yet (or save anything), but that should be a case of extending the same principles into a further function:
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record

def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com' + linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

# START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
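For the document-link data the code above doesn't grab yet, a minimal sketch of that further function might look like the following. It is untested: the case-doc-details selector is borrowed from the question's code, while scrape_doc_page, scrape_doc_links, and the field names are hypothetical. It relies on the same scraperwiki and lxml.html imports as above.

def scrape_doc_page(docurl):
    # Fetch one third-level document page and pull a couple of fields from it.
    html = scraperwiki.scrape(docurl)
    root = lxml.html.fromstring(html)
    record = {}
    record['url'] = docurl
    # Assumed selector: the file link inside the document-file field.
    filelinks = root.cssselect("div.field-name-field-case-doc-file a")
    if filelinks:
        record['fileurl'] = filelinks[0].attrib.get("href")
    else:
        record['fileurl'] = "NO FILE URL"
    print record

def scrape_doc_links(caseroot):
    # Call this from scrape_page with the parsed second-level page;
    # it follows each document-details link found there.
    doclinks = caseroot.cssselect("span.case-doc-details a")
    for doclink in doclinks:
        scrape_doc_page('http://www.italaw.com' + doclink.attrib.get("href"))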
Answer 1 (score: 0)
Here's what I ended up doing for this problem.
A few general points that guided it:
Use an if/else check to distinguish the case where a key attribute's selector matches zero elements from the case where it matches some (illustrated in the sketch below).
Create your dictionary before doing that.
Thanks to Paul for pushing this along.
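To illustrate the first point: every one of the repeated zero-length guards in the code below follows one pattern, which could also be factored into a small helper (first_text is a hypothetical name, not part of the code that follows):

def first_text(nodes, default="None"):
    # Return the text of the first matched node, or a default when the
    # selector matched nothing (the zero-length case).
    if len(nodes) == 0:
        return default
    else:
        return nodes[0].text_content()

# e.g.: record['Rules'] = first_text(caseroots2.cssselect("div.'field-name-field-arbitration-rules'"))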
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            # create another table element with rows, marked off with the case that they came from; create all the rows.
            if len(pars) == 0:
                record['DetailsURL'] = "None"
                record['Count'] = counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'], record)
                counter += 1
            else:
                for i in range(0, len(pars)):
                    record['Count'] = counter
                    caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[i].attrib.get('href')).read()
                    caseroots2 = lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL'] = pars[i].attrib.get('href')
                    title = caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules = caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules) == 0:
                        record['Rules'] = "None"
                    else:
                        record['Rules'] = rules[0].text_content()
                    treaty = caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty) == 0:
                        record['Treaty'] = "None"
                    else:
                        record['Treaty'] = treaty[0].text_content()
                    pars2 = caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2) == 0:
                        record['Doc Date'] = "None"
                    else:
                        record['Doc Date'] = pars2[0].text_content()
                    pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) == 0:
                        record['Doc Type Link'] = "None"
                        record['Doc Type'] = "None"
                    else:
                        record['Doc Type Link'] = pars3[0].attrib.get('href')
                        record['Doc Type'] = pars3[0].text_content()
                    pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4) == 0:
                        record['Claimant Nominee'] = "None"
                    else:
                        record['Claimant Nominee'] = pars4[0].text_content()
                    pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5) == 0:
                        record['Respondent Nominee'] = "None"
                    else:
                        record['Respondent Nominee'] = pars5[0].text_content()
                    pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6) == 0:
                        record['President'] = "None"
                    else:
                        record['President'] = pars6[0].text_content()
                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'], record)
                    counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

# START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
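One note on the saving step: scraperwiki.sqlite.save takes a list of unique-key column names as its first argument, so passing ['Count'] and incrementing counter once per saved record means each document gets its own row rather than overwriting the previous one. That is what produces the multiple rows per h2 item the question asked for.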