目的是搜索输入文件(HTML),并把其中所有图像的URL写入输出文件,以便传递给wget。这是我用Python编写的第一个有用的程序,它在Fedora上似乎运行良好。我在别处没有找到与之类似的现成工具。有没有人有改进的建议?
import fileinput
import re
#replace 'output.txt' with the name of your outfile
OUTPUT_PATH = 'output.txt'
#prefix and postfix are how we discriminate your substring from the infile's line
prefix = '<img src='
postfix = '.jpg'


def _extract_image_urls(line, start_marker, end_marker):
    """Return every substring of *line* that follows start_marker and ends with end_marker.

    Both markers are matched as literal text (not as regular expressions).
    The end marker is only looked for after the start marker, and all
    occurrences on the line are returned, in order.  The end marker is kept
    in the result; the start marker and any quote character directly after
    it are dropped.
    """
    urls = []
    cursor = 0
    while True:
        start = line.find(start_marker, cursor)
        if start == -1:
            break
        start += len(start_marker)
        end = line.find(end_marker, start)
        if end == -1:
            # A start marker without a matching end marker: nothing to emit.
            break
        # If another start marker sits between the first one and the end
        # marker (e.g. a .png image followed by a .jpg image), anchor on the
        # nearest one so the result does not span two tags.
        nearest = line.rfind(start_marker, start, end)
        if nearest != -1:
            start = nearest + len(start_marker)
        end += len(end_marker)
        # src="..." leaves the opening quote in front of the URL; strip it.
        urls.append(line[start:end].lstrip('"\''))
        cursor = end
    return urls


def main(files=None, out_path=OUTPUT_PATH):
    """Write one image URL per line to *out_path*.

    files    -- input file names handed to fileinput.input(); None keeps the
                fileinput default (names from sys.argv[1:], else stdin).
    out_path -- file to (over)write with the extracted URLs.
    """
    # The with-block guarantees the outfile is flushed and closed.
    with open(out_path, 'w') as outfile:
        try:
            #read through the infile line-by-line
            for line in fileinput.input(files):
                for url in _extract_image_urls(line, prefix, postfix):
                    #write your string plus a newline to the outfile
                    outfile.write(url + '\n')
        finally:
            # Release fileinput's module-level state and the current input file.
            fileinput.close()


if __name__ == '__main__':
    main()
答案 0 :(得分:0)
我过去做过类似的事情,效果很好……我相信这会比用正则表达式解析更准确。
from HTMLParser import HTMLParser
class ImageFinder(HTMLParser):
    """HTML parser that writes the src of every <img> start tag to output.txt.

    One URL is written per line.  The output file is opened in __init__ and
    stays open until the instance is used as a context manager or
    self.file is closed by the caller.
    """

    def __init__(self):
        # Explicit base-class call (rather than super()) is kept from the
        # original code.
        HTMLParser.__init__(self)
        self.file = open('output.txt', 'w')

    def handle_starttag(self, tag, attrs):
        """Write the first src attribute of an <img> tag, if it has a value.

        attrs is the list of (name, value) pairs supplied by HTMLParser;
        value is None for an attribute written without a value.
        """
        if tag != "img":
            return
        for name, value in attrs:
            if name == "src":
                # Skip <img src> / <img src=""> instead of failing on None.
                if value:
                    self.file.write(value + "\n")
                break

    def __enter__(self):
        """Allow `with ImageFinder() as parser:` usage."""
        return self

    def __exit__(self, *exc_info):
        """Close the output file; exceptions from the with-body propagate."""
        self.file.close()
# Read the whole document up front; the with-block closes the input file.
with open("myfile.txt") as source:
    inputdata = source.read()
parser = ImageFinder()
try:
    parser.feed(inputdata)
    # HTMLParser.close() forces processing of any buffered trailing markup.
    parser.close()
finally:
    # ImageFinder opens its output file in __init__; close it here so the
    # written URLs are flushed to disk even if parsing fails.
    parser.file.close()