之前可能已经提出过这些问题,但我永远无法理解,因为它与我的代码无关。是的我引用了python for loops
我无法绕过它
Main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
glimit = 5
def craigslist_spider(max_pages):
page = 100
while page <= max_pages:
url = 'https://orlando.craigslist.org/search/apa?s=' + str(page)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "lxml")
limit = glimit
mysoup = soup.findAll('a',{'class':'result-title hdrlnk'})[:limit]
mysoup2 = soup.findAll('time',{'class':'result-date'})[:limit]
for link in mysoup:
for link2 in mysoup2:
href = "https://orlando.craigslist.org" + link.get('href')
title = link.string
date = link2.string
owl = (date) + (title)
print owl
# get_single_item_data(href)
page += 100
# def get_single_item_data(item_url):
# source_code = requests.get(item_url)
# plain_text = source_code.text
# soup = BeautifulSoup(plain_text, "lxml")
# limit = glimit
# mysoup = soup.findAll('div',{'class': 'mapaddress' })[:limit]
# for item in mysoup:
# if item in mysoup is not None:
# print (item.string)
# else:
# print("No Address")
craigslist_spider(100)
产生5次重复内容。
答案 0 :(得分:0)
外循环运行5次并重复内循环。这些不应该嵌套;否则你会多次重复相同的内部链接。
而是循环遍历<p class="result-info">
上的 findall()。在每个段落中,对"result-date"
子元素执行单个 find(),在"result-title"
元素上执行另一个 find()。
for result in soup.findAll('p', {'class': 'result-info'})[:limit]:
link = result.find('a',{'class':'result-title hdrlnk'})
link2 = result.find('time',{'class':'result-date'})
href = "https://orlando.craigslist.org" + link.get('href')
title = link.string
date = link2.string
owl = (date) + (title)
print owl
...