我对 Python 还比较陌生,我写了一个脚本来抓取我所在国家/地区的一个分类广告页面。到目前为止,这个脚本似乎只能抓取到一个条目,这真的快把我逼疯了:我已经尝试修复了一个星期,而且我实在不认识任何能帮上忙的人。如果有人能帮忙看一下,并解释一下我在这里做错了什么,我将不胜感激。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'

# Open the connection and download the page; the context manager closes the
# response even if reading fails.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# HTML parser: switched from "html.parser" to "html5lib" because html.parser
# mishandled the closing form tag.
page_soup = soup(page_html, "html5lib")
containers = page_soup.findAll("form", {"name": "listing"})

# Testing variables: text of the first classified link inside the first form.
tags = containers[0].findAll("a", {"class": "Tahoma16Blacknounder"})
tagx = tags[0].text.strip()

filename = "products.csv"
headers = "names, prices, city, product_condition\n"

# The context manager guarantees the file is flushed and closed even when one
# of the [0] lookups below raises IndexError.
with open(filename, "w") as f:
    f.write(headers)

    # NOTE: this iterates over the <form name="listing"> elements themselves
    # and takes only the first match of each field, so the output has exactly
    # one row per form. If the page wraps all listings in a single form, only
    # one row is written.
    for container in containers:
        # Name of the classified: text of the first link with this class.
        names_container = container.findAll("a", {"class": "Tahoma16Blacknounder"})
        names = names_container[0].text.strip()

        # The span class "Tahoma14BrownNound" seems to hold the prices.
        prices_container = container.findAll("span", {"class": "Tahoma14BrownNound"})
        prices = prices_container[0].text

        # City where the product is offered.
        city_container = container.findAll("font", {"class": "tahoma14hbluenoUnder"})
        city = city_container[0].text.strip()

        # Condition of the product.
        product_condition_container = container.findAll("span", {"class": "style14 style15 style16"})
        product_condition = product_condition_container[0].text

        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)

        # Commas in the name are replaced so they do not split the CSV column.
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
答案 0 :(得分:0)
我查看了该网站的结构,你在获取表单之后漏掉了对其中表格的解析。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv

my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'


def first_text(node, tag, css_class, default=""):
    """Return the stripped text of the first matching descendant of *node*.

    *node* is searched with ``findAll(tag, {"class": css_class})``; when
    nothing matches, *default* is returned instead of raising IndexError.
    """
    matches = node.findAll(tag, {"class": css_class})
    return matches[0].text.strip() if matches else default


# Open the connection and download the page; the context manager closes the
# response even if reading fails.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# HTML parser: switched from "html.parser" to "html5lib" because html.parser
# mishandled the closing form tag.
page_soup = soup(page_html, "html5lib")
containers = page_soup.findAll("form", {"name": "listing"})
if not containers:
    # Fail with a readable message instead of an IndexError on containers[0].
    raise SystemExit("no form named 'listing' found at " + my_url)

filename = "products.csv"

# Iterate over the table rows inside the form rather than over the form
# itself, so every row is examined and not just the first match in the form.
tr = containers[0].findAll('tr', {"valign": "middle"})

# newline="" is required by the csv module; utf-8 keeps accented characters
# intact regardless of the platform default encoding.
with open(filename, "w", newline="", encoding="utf-8") as f:
    # csv.writer quotes fields that contain commas (e.g. a price such as
    # "1,200"), which plain string concatenation would split into columns.
    writer = csv.writer(f)
    writer.writerow(["names", "prices", "city", "product_condition"])

    for container in tr:
        # Rows without a classified link are not listings; skip them.
        names = first_text(container, "a", "Tahoma16Blacknounder", default=None)
        if names is None:
            continue

        # The span class "Tahoma14BrownNound" seems to hold the prices.
        prices = first_text(container, "span", "Tahoma14BrownNound")
        # City where the product is offered.
        city = first_text(container, "font", "tahoma14hbluenoUnder")
        # Condition of the product.
        product_condition = first_text(container, "span", "style14 style15 style16")

        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)

        writer.writerow([names, prices, city, product_condition])