如何从 HTML 表格的 `tr` 和 `td` 元素中提取数据?

时间:2018-05-21 17:08:58

标签: python

我想抓取一个网站,其页面内容由包含 tr 和 td 的表格组成,链接在这里(http://legacy-assignments.uspto.gov/assignments/q?db=pat&qt=rf&reel=044841&frame=0221&pat=&pub=&intn=&asnr=&asnri=&asne=&asnei=&asns=)。 如果更改 URL 中的 reel(卷号)和 frame(帧号)参数,返回的结果就会随之变化,所以我想写一个模块,在其中用循环抓取所有数据。但我卡在了循环这一步,而且我在用 HTML 解析器抓取网站方面还是新手。

请帮我写出适用于这个网站的正确代码。 reel 和 frame 的输入值来自 CSV 文件,这部分我知道该怎么做;我只想知道如何用 HTML 解析器把数据提取出来。

这是我的代码:

from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from time import sleep
import csv
import re
from lxml import html
import lxml.html

# Scrape one USPTO assignment record and print each field on its own line.
#
# Fixes over the previous version of this script:
#   * a missing field no longer crashes with NameError (`print error` referred
#     to an undefined name inside a bare `except:`);
#   * the document is queried once instead of re-running the same absolute
#     XPaths for every row of an outer loop;
#   * each XPath is also tried without its `/tbody` steps, because paths copied
#     from a browser include <tbody> elements that browsers insert implicitly
#     while lxml keeps the markup as served;
#   * reel and frame are parameters, so the script can be driven from a CSV.

URL_TEMPLATE = (
    "http://legacy-assignments.uspto.gov/assignments/q?db=pat&qt=rf"
    "&reel={reel}&frame={frame}&pat=&pub=&intn=&asnr=&asnri=&asne=&asnei=&asns="
)

# Shared XPath prefixes (copied from the browser, hence the `tbody` steps).
_CELL = '/html/body/table[3]/tbody/tr/td/table/tbody/tr/td'
_HEADER = _CELL + '/table[1]/tbody/tr/td[2]/table/tbody'
_PROPERTIES = _CELL + '/table[2]/tbody'
# NOTE(review): only the first property row (tr[2]) is read; records whose
# "Total properties" is greater than 1 presumably have further rows -- confirm
# against the page and extend if needed.
_FIRST_PROPERTY = _PROPERTIES + '/tr[2]/td[2]/table/tbody'

# (field name, XPath of the text node) in the order the fields are printed.
FIELDS = [
    ('reelframe', _HEADER + '/tr[1]/td[2]/a/text()'),
    ('recorded', _HEADER + '/tr[2]/td[4]/text()'),
    ('attorney', _HEADER + '/tr[3]/td[2]/span/text()'),
    ('conveyance', _HEADER + '/tr[4]/td[2]/span/text()'),
    ('totalproperties', _PROPERTIES + '/tr[1]/td/div/text()'),
    ('patent', _FIRST_PROPERTY + '/tr[1]/td[2]/div/a/text()'),
    ('issuedate', _FIRST_PROPERTY + '/tr[1]/td[4]/div/text()'),
    ('application', _FIRST_PROPERTY + '/tr[1]/td[6]/div/text()'),
    ('filingdate', _FIRST_PROPERTY + '/tr[1]/td[8]/div/text()'),
    ('publicationno', _FIRST_PROPERTY + '/tr[2]/td[2]/div/a/text()'),
    ('publicationdate', _FIRST_PROPERTY + '/tr[2]/td[4]/div/text()'),
    ('title', _FIRST_PROPERTY + '/tr[3]/td[2]/div/text()'),
]


def fetch_document(reel, frame):
    """Download and parse the assignment page for the given reel and frame.

    Both arguments are inserted into the URL verbatim, so pass them as
    zero-padded strings (e.g. ``'044841'`` and ``'0221'``).
    """
    return lxml.html.parse(URL_TEMPLATE.format(reel=reel, frame=frame))


def first_text(tree, path):
    """Return the first text node matched by *path*, stripped, or None.

    The path is tried as written and then with every ``/tbody`` step removed,
    so it works whether or not the served HTML contains explicit <tbody> tags.
    """
    for candidate in (path, path.replace('/tbody', '')):
        matches = tree.xpath(candidate)
        if matches:
            return matches[0].strip()
    return None


def emit(text):
    """Print *text*, encoding to UTF-8 first when running on Python 2."""
    # On Python 2 `str is bytes`; encoding avoids UnicodeEncodeError when
    # stdout is a pipe. Python 3 prints text directly.
    if str is bytes:
        text = text.encode('utf8')
    print(text)


def extract_record(tree):
    """Print every field found in *tree* and return them as a dict.

    Fields whose XPath matches nothing are reported on stdout and omitted
    from the returned dict.
    """
    found = {}
    for name, path in FIELDS:
        value = first_text(tree, path)
        if value is None:
            print('error: %s not found' % name)
            continue
        if name == 'totalproperties':
            # The cell reads "Total properties:   <n>"; keep only the count.
            value = ' '.join(value.replace('Total properties:', '').split())
        found[name] = value
        emit(value)
    return found


doc = fetch_document('044841', '0221')
record = extract_record(doc)

0 个答案:

没有答案