无法以所需方式从网页中获取两个字段

时间:2019-08-13 13:44:14

标签: python python-3.x web-scraping

我用 Python 写了一个脚本,用来从网页上的表格中获取两个字段:time 和 currency(即第二列和第三列)。该脚本能够获取到结果,但并不是我期望的结果。

Website address

到目前为止,我已经写过:

import requests
from bs4 import BeautifulSoup

URL = "https://www.forexfactory.com/calendar.php?week=this"

# Fetch the calendar page and parse the returned HTML with the lxml parser.
response = requests.get(URL)
soup = BeautifulSoup(response.text, "lxml")

# Print the time and currency cells of every calendar row exactly as they
# appear in the markup; a blank time cell is printed as an empty string.
for row in soup.select("tr.calendar_row"):
    time_cell = row.select_one("td.calendar__time")
    currency_cell = row.select_one("td.calendar__currency")
    print(time_cell.get_text(strip=True), currency_cell.get_text(strip=True))

我得到的结果:

All Day JPY
5:00am CNY
 CNY
2:00pm USD
1:59am JPY
2:00am EUR
 EUR
4:30am GBP
 GBP
 GBP

预期结果:

All Day JPY
3:00pm CNY
3:00pm CNY
2:00pm USD
1:59am JPY
12:00pm EUR
12:00pm EUR
2:30pm GBP
2:30pm GBP
2:30pm GBP
  

我得到的时间与该网站上显示的时间不同。此外,我希望用前一个时间值来填充空白的时间。

如何修改现有脚本以获取上面显示的结果?

2 个答案:

答案 0 :(得分:4)

问题在于时间列中的单元格为空

import requests
from bs4 import BeautifulSoup

URL = "https://www.forexfactory.com/calendar.php?week=this"

# Cookie dictionary used to set the timezone the times are shown in.
cookies = {
    "fftimezoneoffset": "0",  # timezone, as UTC +/-X
    "fftimeformat": "1",  # time format: 0 = am/pm, 1 = 24-hour
    "ffdstonoff": "1",  # daylight saving
    "ffverifytimes": "1",  # set times to the timezone
}
response = requests.get(URL, cookies=cookies)  # send the timezone cookies along
soup = BeautifulSoup(response.text, "lxml")

# Most recent time seen so far; reused for rows whose time cell is blank.
previous_time = ""
for row in soup.select("tr.calendar_row"):
    # A blank time cell falls back to the time carried over from earlier rows.
    ftime = row.select_one("td.calendar__time").get_text(strip=True) or previous_time
    previous_time = ftime
    currency = row.select_one("td.calendar__currency").get_text(strip=True)
    if currency:  # only print rows that actually have a currency
        print(ftime, currency)

答案 1 :(得分:1)

您可以把每一行的数据追加到一个列表中,并在某个字段缺失时沿用之前保存的值:

from pprint import pprint

import requests
from bs4 import BeautifulSoup

URL = "https://www.forexfactory.com/calendar.php?week=this"

res = requests.get(URL)
soup = BeautifulSoup(res.text, "lxml")

# One {'day', 'time', 'cur'} record is collected per calendar row that has a currency.
results = []

# Carry-over values for rows whose day or time cell is blank. They start as
# empty strings so that a blank cell in the very first row falls back to ""
# instead of raising NameError on an unassigned name.
previous_day = ""
previous_time = ""

for item in soup.select("tr.calendar_row"):
    # A blank day/time cell reuses the value remembered from the previous row.
    day = item.select_one("td.calendar__date").get_text(strip=True) or previous_day
    ftime = item.select_one("td.calendar__time").get_text(strip=True) or previous_time
    currency = item.select_one("td.calendar__currency").get_text(strip=True)

    # Rows without a currency are not recorded, but their day/time still
    # update the carry-over values below.
    if currency:
        results.append(
            {
                'day': day,
                'time': ftime,
                'cur': currency,
            }
        )

    previous_day = day
    previous_time = ftime

pprint(results)

结果:

[{'cur': 'JPY', 'day': 'SunAug 11', 'time': 'All Day'},
 {'cur': 'CNY', 'day': 'MonAug 12', 'time': '5:00am'},
 {'cur': 'CNY', 'day': 'MonAug 12', 'time': '5:00am'},
 {'cur': 'USD', 'day': 'MonAug 12', 'time': '2:00pm'},
 {'cur': 'NZD', 'day': 'MonAug 12', 'time': '6:45pm'},
 {'cur': 'AUD', 'day': 'MonAug 12', 'time': '6:50pm'},
 {'cur': 'JPY', 'day': 'MonAug 12', 'time': '7:50pm'},
 {'cur': 'AUD', 'day': 'MonAug 12', 'time': '9:30pm'},
 {'cur': 'CNY', 'day': 'MonAug 12', 'time': '10:11pm'},
 {'cur': 'JPY', 'day': 'TueAug 13', 'time': '12:30am'},
 {'cur': 'JPY', 'day': 'TueAug 13', 'time': '1:59am'},
 {'cur': 'EUR', 'day': 'TueAug 13', 'time': '2:00am'},
 {'cur': 'EUR', 'day': 'TueAug 13', 'time': '2:00am'},
 {'cur': 'GBP', 'day': 'TueAug 13', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'TueAug 13', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'TueAug 13', 'time': '4:30am'},
 {'cur': 'EUR', 'day': 'TueAug 13', 'time': '5:00am'},
 {'cur': 'EUR', 'day': 'TueAug 13', 'time': '5:00am'},
 {'cur': 'GBP', 'day': 'TueAug 13', 'time': '5:35am'},
 {'cur': 'USD', 'day': 'TueAug 13', 'time': '6:00am'},
 {'cur': 'USD', 'day': 'TueAug 13', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'TueAug 13', 'time': '8:30am'},
 {'cur': 'GBP', 'day': 'TueAug 13', 'time': '9:30am'},
 {'cur': 'JPY', 'day': 'TueAug 13', 'time': '7:50pm'},
 {'cur': 'AUD', 'day': 'TueAug 13', 'time': '8:30pm'},
 {'cur': 'AUD', 'day': 'TueAug 13', 'time': '9:30pm'},
 {'cur': 'CNY', 'day': 'TueAug 13', 'time': '10:00pm'},
 {'cur': 'CNY', 'day': 'TueAug 13', 'time': '10:00pm'},
 {'cur': 'CNY', 'day': 'TueAug 13', 'time': '10:00pm'},
 {'cur': 'CNY', 'day': 'TueAug 13', 'time': '10:00pm'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': '2:00am'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': '2:45am'},
 {'cur': 'AUD', 'day': 'WedAug 14', 'time': '3:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'GBP', 'day': 'WedAug 14', 'time': '4:30am'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': '5:00am'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': '5:00am'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': '5:00am'},
 {'cur': 'EUR', 'day': 'WedAug 14', 'time': 'Tentative'},
 {'cur': 'USD', 'day': 'WedAug 14', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'WedAug 14', 'time': '14th-18th'},
 {'cur': 'USD', 'day': 'WedAug 14', 'time': '10:30am'},
 {'cur': 'AUD', 'day': 'WedAug 14', 'time': '7:00pm'},
 {'cur': 'AUD', 'day': 'WedAug 14', 'time': '9:00pm'},
 {'cur': 'AUD', 'day': 'WedAug 14', 'time': '9:30pm'},
 {'cur': 'AUD', 'day': 'WedAug 14', 'time': '9:30pm'},
 {'cur': 'JPY', 'day': 'ThuAug 15', 'time': '12:30am'},
 {'cur': 'EUR', 'day': 'ThuAug 15', 'time': 'All Day'},
 {'cur': 'EUR', 'day': 'ThuAug 15', 'time': 'All Day'},
 {'cur': 'CHF', 'day': 'ThuAug 15', 'time': '2:30am'},
 {'cur': 'GBP', 'day': 'ThuAug 15', 'time': '4:30am'},
 {'cur': 'CAD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '9:15am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '9:15am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '10:00am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '10:00am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '10:30am'},
 {'cur': 'USD', 'day': 'ThuAug 15', 'time': '4:00pm'},
 {'cur': 'NZD', 'day': 'ThuAug 15', 'time': '6:30pm'},
 {'cur': 'EUR', 'day': 'FriAug 16', 'time': '5:00am'},
 {'cur': 'CAD', 'day': 'FriAug 16', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'FriAug 16', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'FriAug 16', 'time': '8:30am'},
 {'cur': 'USD', 'day': 'FriAug 16', 'time': '10:00am'},
 {'cur': 'USD', 'day': 'FriAug 16', 'time': '10:00am'}]