遍历网站以形成列表

时间:2015-11-02 07:08:05

标签: python list

我正在尝试创建一个网络爬虫,它将遍历网站上的表格并将该表格的各种值放入列表中。我创建了爬虫并成功从网站获取信息。我还创建了一个代码,用于将表中的信息附加到列表中,但只包含第一个值。

# Asker's attempt, kept exactly as posted: download the exchange-rate page and
# collect table-cell text into a list.
# NOTE(review): the parameter slot below holds a string literal instead of a
# name, which is not valid Python. The body reads `url`, so `def fetch(url):`
# called with this address appears to be what was intended.
def fetch("https://www.cs.purdue.edu/homes/jind/exchangerate.html"):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    # Dump the whole decoded page.
    print(content)

    results=[]

    # Only the FIRST '<td>' ... '</td>' pair is located; nothing here loops,
    # so exactly one cell text is appended.
    h1Start = content.find('<td>') + len('<td>')
    h1End = content.find('</td>', h1Start)
    h1 = content[h1Start:h1End]
    results.append(h1)
    # NOTE(review): `i` is never assigned before this increment and is not
    # used by any of the searches above.
    i=i+1

# NOTE(review): `results` is a local of fetch(), and fetch() is never called
# in this snippet, so this module-level print cannot see the list.
print(results)

这段代码只会打印出找到的第一个值 “Currency”(货币),但我需要它打印一个包含所有值的列表,却不知道该怎么做——我不知道如何设置迭代。我的数据集应该类似于下面的结果:

 [['Argentine Peso', [9.44195, 0.10591]], ['Australian Dollar', [1.41824, 0.7051]], ...]

3 个答案:

答案 0 :(得分:0)

下面的代码对我有用 - 使用Beautifulsoup!

import urllib
from  bs4 import  BeautifulSoup
sample_url ='https://www.cs.purdue.edu/homes/jind/exchangerate.html'

def fetch(url):
    """Download *url* and print the currency names and rates from its table.

    Written for Python 2 (``urllib.urlopen``).  Every ``<tr>`` after the
    first is read: the text of its first ``<td>`` is appended to a flat
    list, immediately followed by a list holding the text of the remaining
    ``<td>`` cells.  The flat list is printed; nothing is returned.
    """
    import urllib

    page = urllib.urlopen(url).read().decode('utf-8')
    table_rows = BeautifulSoup(page, 'html.parser').find_all('tr')

    results = []
    # The first row is skipped (presumably the table header).
    for table_row in table_rows[1:]:
        cells = table_row.find_all('td')
        # Name and rates go in as two consecutive items, not as one pair.
        results.extend([cells[0].text, [cell.text for cell in cells[1:]]])
    print(results)

fetch(sample_url)

打印

[u'Argentine Peso', [u'9.44195', u'0.10591'], u'Australian Dollar', [u'1.41824', u'0.70510'], u'Bahraini Dinar', [u'0.37743', u'2.64953'], u'Botswana Pula', [u'10.58050', u'0.09451'], u'Brazilian Real', [u'3.93609', u'0.25406'], u'Bruneian Dollar', [u'1.43290', u'0.69789'], u'Bulgarian Lev', [u'1.74144', u'0.57424'], u'Canadian Dollar', [u'1.31585', u'0.75997'], u'Chilean Peso', [u'689.09439', u'0.00145'], u'Chinese Yuan Renminbi', [u'6.35520', u'0.15735'], u'Colombian Peso', [u'3036.83683', u'0.00033'], u'Croatian Kuna', [u'6.79750', u'0.14711'], u'Czech Koruna', [u'24.22620', u'0.04128'], u'Danish Krone', [u'6.65480', u'0.15027'], u'Euro', [u'0.89186', u'1.12125'], u'Hong Kong Dollar', [u'7.75023', u'0.12903'], u'Hungarian Forint', [u'277.95000', u'0.00360'], u'Icelandic Krona', [u'127.01477', u'0.00787'], u'Indian Rupee', [u'65.21894', u'0.01533'], u'Indonesian Rupiah', [u'14733.50000', u'0.00007'], u'Iranian Rial', [u'29958.05872', u'0.00003'], u'Israeli Shekel', [u'3.91230', u'0.25560'], u'Japanese Yen', [u'119.87500', u'0.00834'], u'Kazakhstani Tenge', [u'271.55834', u'0.00368'], u'South Korean Won', [u'1172.93798', u'0.00085'], u'Kuwaiti Dinar', [u'0.30235', u'3.30743'], u'Libyan Dinar', [u'1.35500', u'0.73801'], u'Malaysian Ringgit', [u'4.41250', u'0.22663'], u'Mauritian Rupee', [u'35.69937', u'0.02801'], u'Mexican Peso', [u'16.75875', u'0.05967'], u'Nepalese Rupee', [u'104.90395', u'0.00953'], u'New Zealand Dollar', [u'1.55135', u'0.64460'], u'Norwegian Krone', [u'8.38615', u'0.11924'], u'Omani Rial', [u'0.38510', u'2.59673'], u'Pakistani Rupee', [u'104.47498', u'0.00957'], u'Philippine Peso', [u'46.59361', u'0.02146'], u'Polish Zloty', [u'3.78735', u'0.26404'], u'Qatari Riyal', [u'3.64165', u'0.27460'], u'Romanian New Leu', [u'3.94165', u'0.25370'], u'Russian Ruble', [u'66.04697', u'0.01514'], u'Saudi Arabian Riyal', [u'3.75020', u'0.26665'], u'Singapore Dollar', [u'1.43290', u'0.69789'], u'South African Rand', [u'13.73598', u'0.07280'], u'Sri Lankan 
Rupee', [u'141.11000', u'0.00709'], u'Swedish Krona', [u'8.35875', u'0.11964'], u'Swiss Franc', [u'0.97144', u'1.02941'], u'Taiwan New Dollar', [u'32.70499', u'0.03058'], u'Thai Baht', [u'36.42047', u'0.02746'], u'Trinidadian Dollar', [u'6.34523', u'0.15760'], u'Turkish Lira', [u'2.99055', u'0.33439'], u'Emirati Dirham', [u'3.67295', u'0.27226'], u'British Pound', [u'0.65872', u'1.51810'], u'Venezuelan Bolivar', [u'6.34961', u'0.15749'], u'Latvian Lat', [u'0.62680', u'1.59541'], u'Lithuanian Litas', [u'3.07941', u'0.32474']]

如果不想使用额外的库,也可以使用下面的代码,但强烈不建议这样做 -

import urllib,re

sample_url ='https://www.cs.purdue.edu/homes/jind/exchangerate.html'
results=[]

class unicode(unicode):
    """``unicode`` subclass whose ``repr`` omits the Python 2 ``u`` prefix.

    It deliberately shadows the built-in name so that printing a list of
    these strings shows ``'text'`` instead of ``u'text'``.
    """

    def __repr__(self):
        # super() reaches the built-in unicode.__repr__ without going through
        # ``__builtins__``, which is a module only inside ``__main__`` and a
        # plain dict in imported modules (CPython implementation detail).
        return super(unicode, self).__repr__().lstrip("u")

def fetch(url):
    """Download *url*, parse its table with regexes and print the rows.

    Written for Python 2 (``urllib.urlopen``).  For every ``<tr>...</tr>``
    match after the first, the first three cell texts are taken as a name
    and two rates; ``[name, [rate, rate]]`` (rates converted to ``float``)
    is appended to the module-level ``results`` list, which is then
    printed.  Nothing is returned.
    """
    import urllib

    html = urllib.urlopen(url).read().decode('utf-8')
    cell_pattern = re.compile(r'(?i)<td.*?>([^<]+)</td.*?>')

    row_bodies = re.findall(r'<tr>(.*?)</tr>', html, re.M|re.I|re.S)
    # The first match is skipped (presumably the table header).
    for row_body in row_bodies[1:]:
        cells = cell_pattern.findall(row_body)
        name, buy, sell = cells[0], float(cells[1]), float(cells[2])
        results.append([unicode(name), [buy, sell]])

    print(results)

fetch(sample_url)

打印 -

[['Argentine Peso', [9.44195, 0.10591]], ['Australian Dollar', [1.41824, 0.7051]], ['Bahraini Dinar', [0.37743, 2.64953]], ['Botswana Pula', [10.5805, 0.09451]], ['Brazilian Real', [3.93609, 0.25406]], ['Bruneian Dollar', [1.4329, 0.69789]], ['Bulgarian Lev', [1.74144, 0.57424]], ['Canadian Dollar', [1.31585, 0.75997]], ['Chilean Peso', [689.09439, 0.00145]], ['Chinese Yuan Renminbi', [6.3552, 0.15735]], ['Colombian Peso', [3036.83683, 0.00033]], ['Croatian Kuna', [6.7975, 0.14711]], ['Czech Koruna', [24.2262, 0.04128]], ['Danish Krone', [6.6548, 0.15027]], ['Euro', [0.89186, 1.12125]], ['Hong Kong Dollar', [7.75023, 0.12903]], ['Hungarian Forint', [277.95, 0.0036]], ['Icelandic Krona', [127.01477, 0.00787]], ['Indian Rupee', [65.21894, 0.01533]], ['Indonesian Rupiah', [14733.5, 7e-05]], ['Iranian Rial', [29958.05872, 3e-05]], ['Israeli Shekel', [3.9123, 0.2556]], ['Japanese Yen', [119.875, 0.00834]], ['Kazakhstani Tenge', [271.55834, 0.00368]], ['South Korean Won', [1172.93798, 0.00085]], ['Kuwaiti Dinar', [0.30235, 3.30743]], ['Libyan Dinar', [1.355, 0.73801]], ['Malaysian Ringgit', [4.4125, 0.22663]], ['Mauritian Rupee', [35.69937, 0.02801]], ['Mexican Peso', [16.75875, 0.05967]], ['Nepalese Rupee', [104.90395, 0.00953]], ['New Zealand Dollar', [1.55135, 0.6446]], ['Norwegian Krone', [8.38615, 0.11924]], ['Omani Rial', [0.3851, 2.59673]], ['Pakistani Rupee', [104.47498, 0.00957]], ['Philippine Peso', [46.59361, 0.02146]], ['Polish Zloty', [3.78735, 0.26404]], ['Qatari Riyal', [3.64165, 0.2746]], ['Romanian New Leu', [3.94165, 0.2537]], ['Russian Ruble', [66.04697, 0.01514]], ['Saudi Arabian Riyal', [3.7502, 0.26665]], ['Singapore Dollar', [1.4329, 0.69789]], ['South African Rand', [13.73598, 0.0728]], ['Sri Lankan Rupee', [141.11, 0.00709]], ['Swedish Krona', [8.35875, 0.11964]], ['Swiss Franc', [0.97144, 1.02941]], ['Taiwan New Dollar', [32.70499, 0.03058]], ['Thai Baht', [36.42047, 0.02746]], ['Trinidadian Dollar', [6.34523, 0.1576]], ['Turkish Lira', 
[2.99055, 0.33439]], ['Emirati Dirham', [3.67295, 0.27226]], ['British Pound', [0.65872, 1.5181]], ['Venezuelan Bolivar', [6.34961, 0.15749]], ['Latvian Lat', [0.6268, 1.59541]], ['Lithuanian Litas', [3.07941, 0.32474]]]

答案 1 :(得分:0)

你不必知道正则表达式,只需简单的循环即可。但强烈建议不要使用beautifulsoup进行解析。 代码:

import urllib2
request = urllib2.Request("https://www.cs.purdue.edu/homes/jind/exchangerate.html")
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')

def between_td(conte,start):
    """Locate the text of the next ``<td>`` cell at or after *start*.

    Returns ``(text_start, text_end)``: the index just past the next
    ``<td>`` tag and the index of the ``</td>`` that follows it.  Nothing
    is validated: a missing ``<td>`` yields ``len('<td>') - 1`` as the
    start and a missing ``</td>`` yields ``-1`` as the end.
    """
    start = conte.find('<td>', start) + len('<td>')
    end = conte.find('</td>', start)
    return start, end


def read_content(cont):
    """Print and return the table rows found in the HTML text *cont*.

    For each ``<tr>`` that has a closing ``</tr>``, the next three ``<td>``
    cells found from that position onward are collected as
    ``[first_cell, [second_cell, third_cell]]``.  The list of rows is
    printed and also returned.

    All slicing is done on *cont* itself, so the function no longer
    depends on a module-level ``content`` variable.
    """
    result = []
    pos = 0
    while True:
        pos = cont.find('<tr>', pos)
        row_end = cont.find('</tr>', pos)
        # Stop when there is no further row or the row is never closed.
        if pos == -1 or row_end == -1:
            break

        # First cell of the row.
        pos, cell_end = between_td(cont, pos)
        row = [cont[pos:cell_end]]

        # The two cells that follow it.
        columns = []
        for _ in range(2):
            pos, cell_end = between_td(cont, pos)
            columns.append(cont[pos:cell_end])
        row.append(columns)

        result.append(row)
    print(result)
    return result
read_content(content)

并打印:

[[u'Argentine Peso', [u'9.44195', u'0.10591']], [u'Australian Dollar', [u'1.41824', u'0.70510']], [u'Bahraini Dinar', ...

如果你觉得这个答案不合适,请告诉我,我会删除它。它不太 pythonic,但足以满足你的需求,而且很容易阅读。

答案 2 :(得分:0)

这是我的解决方案。它可能看起来不太好看,但同样有效。

from urllib.request import urlopen

def find_element(line, s_pattern, e_pattern, position=0):
    """Return the text between *s_pattern* and *e_pattern* in *line*.

    The search for *s_pattern* begins at *position*; *e_pattern* is then
    searched for after it.  The result is ``(text, text_start)``, where
    ``text_start`` is the index just past *s_pattern* and can be passed
    back as *position* to find the next element.  Missing patterns are not
    detected: ``str.find``'s ``-1`` is used in the arithmetic as is.
    """
    text_start = line.find(s_pattern, position) + len(s_pattern)
    text_end = line.find(e_pattern, text_start)
    return line[text_start:text_end], text_start

# Download the exchange-rate page and build [[name, [value, value]], ...]
# from its table, relying on each data row sitting on a single line of HTML.
result = []
# The context manager closes the HTTP response once the lines are consumed.
with urlopen("https://www.cs.purdue.edu/homes/jind/exchangerate.html") as html:
    for raw_line in html:
        line = raw_line.decode()
        if "<tr><td>" not in line:
            continue  # skip lines that do not contain a table row
        if "Currency" in line:
            continue  # skip header

        # First cell of the row, then the two cells that follow it.
        name, start_pos = find_element(line, "<tr><td>", "</td>")
        values = []
        for _ in range(2):
            element, start_pos = find_element(line, "<td>", "</td>", start_pos)
            values.append(element)

        # Append the finished row in one step; no running index is needed.
        result.append([name, values])
print(result)