I have some code that scrapes all the content from a particular webpage. I now want to build something that gives me the specific details: for example, if I enter a style ID it should give me the related details for it, or if I enter a category it should give me all the items in that category along with their details. My code is:-
import requests, re
from bs4 import BeautifulSoup

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

links = soup.find_all("a")
img = soup.find(itemprop="image")

# Category: taken from the breadcrumb links
g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
for item in g_d4:
    links_2 = soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2 = re.compile(r"clothing/(\w+)")
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("Category:- " + match_1.group(1))
            break

# Product details: brand, type, description, style ID, recommendations, prices
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        print("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        print(a_2)
    except:
        pass
    try:
        print("TYPE:-" + item.find("h1", {"class": "product-name", "itemprop": "name"}).text + ';')
    except:
        pass
    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        p_id = re.findall(r'[0-9]{9}', d3)
        id_2 = p_id[0]
        # Recommended products come from the richrelevance endpoint for this style ID
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p=' + str(id_2) + '&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1 = requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print("DETAILS:- " + d3 + ';')
        print("\nStyle ID:- " + id_2 + ';')
        print("\nRecommended Product ID's:- ")
        print(','.join(i for i in product_ids))
    except:
        pass
    try:
        print("\nURL:-" + img["src"] + ';')
    except:
        pass
    try:
        print("\nFull Price:-" + item.find("span", {"class": "price-standard"}).text + ';')
    except:
        pass
    try:
        print("\nDiscounted Price:-" + item.find("span", {"class": "price-sales"}).text + ';')
    except:
        pass

# Other available colours of the same product
g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
for item in g_d2:
    links_1 = soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nProduct ID of other color:-")
            print(match.group(1))
Answer 0: (score: 0)
I added a dictionary called d and stored the scraped values in it:
import requests, re
from bs4 import BeautifulSoup

d = {}

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

links = soup.find_all("a")
d["links"] = []
d["links"].append(("href", [link.get("href") for link in links]))
d["links"].append(("class", [link.get("class") for link in links]))

img = soup.find(itemprop="image")
d["img"] = []
d["img"].append([("alt", img.get("alt")), ("src", img.get("src")), ("itemprop", img.get("itemprop")), ("class", img.get("class")[0])])  # Note: this is a list of tuples, so index numerically, e.g. d["img"][0], instead of d["img"]["alt"]

g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
for item in g_d4:
    links_2 = soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2 = re.compile(r"clothing/(\w+)")
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("Category:- " + match_1.group(1))
            break

g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        d["Brand"] = item.contents[1].text
        print("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        d["a_2"] = a_2
        print(a_2)
    except:
        pass
    try:
        print("TYPE:-" + item.find("h1", {"class": "product-name", "itemprop": "name"}).text + ';')
        d["Type"] = item.find("h1", {"class": "product-name", "itemprop": "name"}).text
    except:
        pass
    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        p_id = re.findall(r'[0-9]{9}', d3)
        id_2 = p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p=' + str(id_2) + '&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1 = requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print("DETAILS:- " + d3 + ';')
        d["Details"] = d3.split(",")
        print("\nStyle ID:- " + id_2 + ';')
        d["Style"] = ("ID", id_2)
        print("\nRecommended Product ID's:- ")
        print(','.join(i for i in product_ids))
        d["RecommendedProductIDs"] = [i for i in product_ids]
    except:
        pass
    try:
        print("\nURL:-" + img["src"] + ';')
    except:
        pass
    try:
        print("\nFull Price:-" + item.find("span", {"class": "price-standard"}).text + ';')
    except:
        pass
    try:
        print("\nDiscounted Price:-" + item.find("span", {"class": "price-sales"}).text + ';')
    except:
        pass

g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
for item in g_d2:
    links_1 = soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nProduct ID of other color:-")
            print(match.group(1))
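
To get the lookup you described (style ID -> details, category -> all items), you would run the scraper once per product page and index each resulting d. Here is a minimal sketch of that idea; the names index_by_style, index_by_category, index_product, details_for_style and items_in_category are hypothetical helpers, not part of the code above, and the example style ID and category are just placeholders taken from the product URL.

# Minimal sketch: lookup tables built from the per-product dictionaries.
# Assumes each product page has been scraped into its own dict d as above.
index_by_style = {}     # style ID -> product dict
index_by_category = {}  # category name -> list of product dicts

def index_product(d, category):
    """Store one scraped product dict under its style ID and its category."""
    style_id = d.get("Style", (None, None))[1]   # d["Style"] is ("ID", id_2)
    if style_id:
        index_by_style[style_id] = d
    index_by_category.setdefault(category, []).append(d)

def details_for_style(style_id):
    """Return the stored details for a given style ID, or None if unknown."""
    return index_by_style.get(style_id)

def items_in_category(category):
    """Return every stored product dict for a given category."""
    return index_by_category.get(category, [])

# Example usage, assuming d holds the sweater scraped above and the breadcrumb
# printed "Category:- sweaters" (placeholder values):
# index_product(d, "sweaters")
# print(details_for_style("503900006"))
# for product in items_in_category("sweaters"):
#     print(product.get("Type"), product.get("Details"))

This keeps the scraping and the querying separate: the loop over product pages only fills the two indexes, and the two lookup helpers answer the "give me details for this style ID" and "give me all items in this category" questions without re-fetching anything.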