我想抓取一个网页,其内容是由 tr 和 td 组成的表格(链接:http://legacy-assignments.uspto.gov/assignments/q?db=pat&qt=rf&reel=044841&frame=0221&pat=&pub=&intn=&asnr=&asnri=&asne=&asnei=&asns=)。如果更改 URL 中的 reel(卷号)和 frame(帧号)参数,返回的结果也会随之变化,所以我想编写一个模块,在其中循环遍历这些参数并抓取所有数据。但我卡在了循环这一步,而且我在使用 HTML 解析器抓取网站方面还是新手。
请帮我写出适用于这个网站的正确代码。reel 和 frame 的输入值来自一个 CSV 文件,这一部分我知道该怎么做;我只想知道如何用 HTML 解析器把数据提取出来。
这是我的代码:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from time import sleep
import csv
import re
from lxml import html
import lxml.html
# Query URL for a single assignment record; the two %s slots take the reel
# and frame numbers (kept as strings so leading zeros survive).
ASSIGNMENT_URL = (
    "http://legacy-assignments.uspto.gov/assignments/q?db=pat&qt=rf"
    "&reel=%s&frame=%s&pat=&pub=&intn=&asnr=&asnri=&asne=&asnei=&asns="
)

# Shared XPath prefixes for the three nested tables the fields live in.
HEADER_TABLE = ('/html/body/table[3]/tbody/tr/td/table/tbody/tr/td'
                '/table[1]/tbody/tr/td[2]/table/tbody')
PROPERTIES_TABLE = ('/html/body/table[3]/tbody/tr/td/table/tbody/tr/td'
                    '/table[2]/tbody')
PATENT_TABLE = PROPERTIES_TABLE + '/tr[2]/td[2]/table/tbody'

# (label, absolute XPath) for every value to extract, in output order.
FIELDS = [
    ('reelframe', HEADER_TABLE + '/tr[1]/td[2]/a/text()'),
    ('recorded', HEADER_TABLE + '/tr[2]/td[4]/text()'),
    ('attorney', HEADER_TABLE + '/tr[3]/td[2]/span/text()'),
    ('conveyance', HEADER_TABLE + '/tr[4]/td[2]/span/text()'),
    ('totalproperties', PROPERTIES_TABLE + '/tr[1]/td/div/text()'),
    ('patent', PATENT_TABLE + '/tr[1]/td[2]/div/a/text()'),
    ('issuedate', PATENT_TABLE + '/tr[1]/td[4]/div/text()'),
    ('application', PATENT_TABLE + '/tr[1]/td[6]/div/text()'),
    ('filingdate', PATENT_TABLE + '/tr[1]/td[8]/div/text()'),
    ('publicationno', PATENT_TABLE + '/tr[2]/td[2]/div/a/text()'),
    ('publicationdate', PATENT_TABLE + '/tr[2]/td[4]/div/text()'),
    ('title', PATENT_TABLE + '/tr[3]/td[2]/div/text()'),
]


def assignment_url(reel, frame):
    """Return the query URL for the given reel and frame numbers."""
    return ASSIGNMENT_URL % (reel, frame)


def without_tbody(path):
    """Return *path* with every ``/tbody`` step removed."""
    return path.replace('/tbody', '')


def first_text(doc, path):
    """Return the first node matched by *path* in *doc*, or None.

    The path is tried as written first.  If it matches nothing it is retried
    without its ``/tbody`` steps: browsers add <tbody> elements to the DOM
    they display, while lxml only reports elements present in the markup, so
    an XPath copied from browser dev tools can fail for that reason alone.
    """
    matches = doc.xpath(path)
    if not matches and '/tbody' in path:
        matches = doc.xpath(without_tbody(path))
    return matches[0] if matches else None


def clean_value(label, text):
    """Return *text* stripped, with field-specific clean-up applied.

    For ``totalproperties`` all whitespace (including non-breaking spaces)
    is removed and the "Total properties:" caption is dropped, leaving only
    the count.  The caption is matched *after* the whitespace removal, so it
    is looked up in its space-less form.
    """
    text = text.strip()
    if label == 'totalproperties':
        text = re.sub(r'\s+', '', text, flags=re.UNICODE)
        text = text.replace('Totalproperties:', '')
    return text


def scrape_assignment(reel, frame):
    """Fetch one assignment page and return its fields.

    Args:
        reel: reel number as it should appear in the URL (e.g. '044841').
        frame: frame number as it should appear in the URL (e.g. '0221').

    Returns:
        A list of ``(label, value)`` pairs in the order of ``FIELDS``;
        ``value`` is None when the page has no node for that field.

    Errors raised by ``lxml.html.parse`` while fetching or parsing the page
    are not caught here.
    """
    doc = lxml.html.parse(assignment_url(reel, frame))
    record = []
    for label, path in FIELDS:
        # Every path is absolute, so it is evaluated once against the whole
        # document rather than once per table row.
        raw = first_text(doc, path)
        record.append((label, None if raw is None else clean_value(label, raw)))
    return record


def _printable(text):
    """Return *text* in a form ``print`` can always emit.

    On Python 2 (where ``str`` is ``bytes``) the text is UTF-8 encoded, as
    the script always did; on Python 3 it is returned unchanged.
    """
    if str is bytes:
        return text.encode('utf8')
    return text


def main():
    """Scrape the example record (reel 044841, frame 0221) and print it."""
    for label, value in scrape_assignment('044841', '0221'):
        if value is None:
            # Report the missing field instead of aborting the whole run.
            print('error: %s not found' % label)
        else:
            print(_printable(value))


if __name__ == "__main__":
    main()