Scrape存储在远程目录中的html文件

时间:2013-09-23 05:36:23

标签: python python-2.7 beautifulsoup screen-scraping

我有数千个html文件存储在远程目录中。所有这些文件都具有相同的HTML结构。现在我正在使用以下脚本手动抓取每个文件

from string import punctuation, whitespace
import urllib2
import datetime
import re
from bs4 import BeautifulSoup as Soup
import csv
today = datetime.date.today()
html = urllib2.urlopen("http://hostname/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html").read()

soup = Soup(html)
for li in soup.findAll('li', attrs={'class':'g'}):
    sLink = li.find('a')
    print sLink['href']
    sSpan = li.find('span', attrs={'class':'st'})
    print sSpan

所以上面的脚本只针对一个URL。同样地，我想抓取该目录下的所有html文件，而不管文件名是什么。我没有发现有人问过这个问题。

更新:代码

import urllib2
import BeautifulSoup
import re

Newlines = re.compile(r'[\r\n]\s+')

def getPageText(url):
    # given a url, get page content
 data = urllib2.urlopen(url).read()
    # parse as html structured document
 bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    # kill javascript content
 for li in bs.findAll('li', attrs={'class':'g'}):
  sLink = li.find('a')
  print sLink['href']
  sSpan = li.find('span', attrs={'class':'st'})
  print sSpan
def main():
    """Scrape each hard-coded listing URL in turn."""
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html'
    ]
    txt = []
    for url in urls:
        txt.append(getPageText(url))

if __name__=="__main__":
    main()

1 个答案:

答案 0（得分：1）

使用循环:

...

for url in url_list:
    html = urllib2.urlopen(url).read()

    soup = Soup(html)
    for li in soup.findAll('li', attrs={'class':'g'}):
        sLink = li.find('a')
        print sLink['href']
        sSpan = li.find('span', attrs={'class':'st'})
        print sSpan

如果您事先不知道网址列表,则必须解析列表页面。


import csv
import urllib2

import BeautifulSoup


def getPageText(url, filename):
    """Fetch *url* and write one CSV row (href, snippet span) per
    <li class="g"> entry into *filename*.

    The file is opened in binary mode ('wb'): the Python 2 csv module
    requires the 'b' flag on platforms where it makes a difference,
    otherwise every row is followed by a blank line on Windows.
    """
    data = urllib2.urlopen(url).read()
    # Parse with entity conversion (BeautifulSoup 3 API).
    bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        for li in bs.findAll('li', attrs={'class': 'g'}):
            sLink = li.find('a')
            sSpan = li.find('span', attrs={'class': 'st'})
            writer.writerow([sLink['href'], sSpan])

def main():
    """Scrape the hard-coded listing URLs, writing 1.csv, 2.csv, ..."""
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html',
    ]
    number = 1
    for url in urls:
        getPageText(url, '{}.csv'.format(number))
        number += 1

if __name__=="__main__":
    main()