我有数千个html文件存储在远程目录中。所有这些文件都具有相同的HTML结构。目前我使用以下脚本逐个手动抓取每个文件:
from string import punctuation, whitespace
import urllib2
import datetime
import re
from bs4 import BeautifulSoup as Soup
import csv
# Date on which the script runs; not used by the code visible in this snippet.
today = datetime.date.today()

# Download one results page and close the response as soon as the body
# has been read, so the connection is not left open.
response = urllib2.urlopen("http://hostname/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html")
try:
    html = response.read()
finally:
    response.close()

soup = Soup(html)

# Every result is an <li class="g">: print the target of its first link
# followed by its <span class="st"> element.
for li in soup.findAll('li', attrs={'class': 'g'}):
    sLink = li.find('a')
    # find() returns None when nothing matches; skip such items instead of
    # failing on None['href'] (or on an anchor that has no href).
    if sLink is None or not sLink.get('href'):
        continue
    # A parenthesised single argument prints the same as the bare
    # Python 2 print statement.
    print(sLink['href'])
    sSpan = li.find('span', attrs={'class': 'st'})
    print(sSpan)
所以上面的脚本只针对一个URL。同样地,我想抓取该目录下的所有html文件,而不管文件名是什么。我没有发现这个问题已被问到。
更新:代码
import urllib2
import BeautifulSoup
import re
# Matches a carriage return or line feed followed by one or more
# whitespace characters.
# NOTE(review): not referenced by any code visible in this snippet.
Newlines = re.compile(r'[\r\n]\s+')
def getPageText(url):
    """Fetch *url* and print the link and snippet of every result item.

    For each ``<li class="g">`` element in the page, print the ``href`` of
    its first ``<a>`` element followed by its ``<span class="st">`` element.
    Items that have no link (or a link without ``href``) are skipped.

    Args:
        url: Address of the HTML page to download.

    Returns:
        None. All output goes to stdout.
    """
    # Read the body, then close the response so the connection is released
    # even if reading fails.
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()

    # Parse as an HTML document, converting HTML entities while parsing.
    bs = BeautifulSoup.BeautifulSoup(
        data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)

    for li in bs.findAll('li', attrs={'class': 'g'}):
        sLink = li.find('a')
        # find() returns None when nothing matches; skip instead of
        # failing on None['href'].
        if sLink is None or not sLink.get('href'):
            continue
        # A parenthesised single argument prints the same as the bare
        # Python 2 print statement.
        print(sLink['href'])
        sSpan = li.find('span', attrs={'class': 'st'})
        print(sSpan)
def main():
    """Run getPageText on every URL in the hard-coded list."""
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html'
    ]
    # getPageText is called for its printed output, so use a plain loop
    # rather than collecting its return values into an unused list.
    for url in urls:
        getPageText(url)


if __name__ == "__main__":
    main()
答案 0 :(得分:1)
使用循环:
...
# Scrape every page in url_list (which must be defined beforehand),
# printing the link and snippet of each result item.
for url in url_list:
    # Read the body, then close the response so connections do not pile up
    # while iterating over many pages.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    soup = Soup(html)
    for li in soup.findAll('li', attrs={'class': 'g'}):
        sLink = li.find('a')
        # find() returns None when nothing matches; skip instead of
        # failing on None['href'].
        if sLink is None or not sLink.get('href'):
            continue
        # A parenthesised single argument prints the same as the bare
        # Python 2 print statement.
        print(sLink['href'])
        sSpan = li.find('span', attrs={'class': 'st'})
        print(sSpan)
如果您事先不知道网址列表,则必须解析列表页面。
import csv
import urllib2
import BeautifulSoup
def getPageText(url, filename):
    """Fetch *url* and write the link and snippet of each result to a CSV file.

    One row is written per ``<li class="g">`` element: the ``href`` of its
    first ``<a>`` element and its ``<span class="st">`` element.  Items that
    have no link (or a link without ``href``) are skipped.

    Args:
        url: Address of the HTML page to download.
        filename: Path of the CSV file to create; an existing file is
            overwritten.

    Returns:
        None.
    """
    # Read the body, then close the response so the connection is released
    # even if reading fails.
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()

    # Parse as an HTML document, converting HTML entities while parsing.
    bs = BeautifulSoup.BeautifulSoup(
        data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)

    # The Python 2 csv module expects a file opened in binary mode; text
    # mode can produce extra carriage returns on platforms that translate
    # line endings.
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        for li in bs.findAll('li', attrs={'class': 'g'}):
            sLink = li.find('a')
            # find() returns None when nothing matches; skip instead of
            # failing on None['href'].
            if sLink is None or not sLink.get('href'):
                continue
            sSpan = li.find('span', attrs={'class': 'st'})
            writer.writerow([sLink['href'], sSpan])
def main():
    """Scrape each hard-coded URL into its own numbered CSV file.

    The first URL is written to ``1.csv``, the second to ``2.csv``, and so
    on, in list order.
    """
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html',
    ]
    # Output files are numbered from 1, in the same order as the URLs.
    for index, page_url in enumerate(urls, 1):
        csv_name = '{}.csv'.format(index)
        getPageText(page_url, csv_name)


if __name__ == "__main__":
    main()