Python脚本只扫描一个项目(分类页面)

时间:2017-08-30 02:15:17

标签: python web-scraping beautifulsoup

Python 爬虫只抓取到 1 个条目……

我对 Python 还比较陌生,我写了一个脚本来抓取我所在国家/地区的分类广告页面。到目前为止,脚本似乎只能抓取到一个条目,这快把我逼疯了——我已经尝试修复它一个星期了,而且身边也没有可以请教的人。如果有人能看一看并解释一下我哪里做错了,我将不胜感激。

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'

# Open the connection and grab the raw page HTML; close the socket even
# if read() raises.
uClient = uReq(my_url)
try:
    page_html = uClient.read()
finally:
    uClient.close()

# HTML parser -- switched from "html.parser" to "html5lib" because the
# page's malformed closing form tag breaks the stricter parser.
page_soup = soup(page_html, "html5lib")

# NOTE(review): the page appears to have a single <form name="listing">
# wrapping ALL listings, so this list holds one element and the loop
# below runs once -- that is why only one item is scraped. Iterating the
# rows inside the form is the fix (see the answer below).
containers = page_soup.findAll("form", {"name": "listing"})

headers = "names, prices, city, product_condition\n"

# `with` guarantees the CSV file is closed even if parsing raises.
with open("products.csv", "w") as f:
    f.write(headers)

    for container in containers:
        # Anchor text holds the name of the classified.
        names_container = container.findAll("a", {"class": "Tahoma16Blacknounder"})
        names = names_container[0].text.strip() if names_container else ''

        # The span class "Tahoma14BrownNound" seems to hold the prices;
        # the field may be absent, so guard the [0] lookup.
        prices_container = container.findAll("span", {"class": "Tahoma14BrownNound"})
        prices = prices_container[0].text if prices_container else ''

        # City where the product is located.
        city_container = container.findAll("font", {"class": "tahoma14hbluenoUnder"})
        city = city_container[0].text.strip() if city_container else ''

        # Condition (new/used) of the product.
        product_condition_container = container.findAll("span", {"class": "style14 style15 style16"})
        product_condition = product_condition_container[0].text if product_condition_container else ''

        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)

        # Escape commas in the name so the CSV columns stay aligned.
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")

1 个答案:

答案 0 :(得分:0)

我查看了网站的结构,发现你遗漏了对表单(form)内部表格(table)的解析——所有条目都在同一个表单里的各个表格行中。

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'

# Open the connection and grab the raw page HTML; close the socket even
# if read() raises.
uClient = uReq(my_url)
try:
    page_html = uClient.read()
finally:
    uClient.close()

# HTML parser -- switched from "html.parser" to "html5lib" because the
# page's malformed closing form tag breaks the stricter parser.
page_soup = soup(page_html, "html5lib")

containers = page_soup.findAll("form", {"name": "listing"})

headers = "names, prices, city, product_condition\n"

# The single listing form wraps one table row (<tr valign="middle">) per
# classified -- iterate the rows, not the form itself.
rows = containers[0].findAll('tr', {"valign": "middle"})

# `with` guarantees the CSV file is closed even if parsing raises.
with open("products.csv", "w") as f:
    f.write(headers)

    for row in rows:
        names_container = row.findAll("a", {"class": "Tahoma16Blacknounder"})
        if not names_container:
            continue  # layout-only row with no listing anchor -- skip it

        # Anchor text holds the name of the classified.
        names = names_container[0].text.strip()

        # The span class "Tahoma14BrownNound" seems to hold the prices;
        # the field may be absent, so guard the [0] lookup.
        prices_container = row.findAll("span", {"class": "Tahoma14BrownNound"})
        prices = prices_container[0].text if prices_container else ''

        # City where the product is located.
        city_container = row.findAll("font", {"class": "tahoma14hbluenoUnder"})
        city = city_container[0].text.strip() if city_container else ''

        # Condition (new/used) of the product.
        product_condition_container = row.findAll("span", {"class": "style14 style15 style16"})
        product_condition = product_condition_container[0].text if product_condition_container else ''

        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)

        # BUG FIX: in the original this write sat OUTSIDE the loop, so only
        # the last listing ever reached the CSV. It belongs inside the loop.
        # Commas in the name are escaped so the columns stay aligned.
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")