So I have this Python script that scrapes the listings from a specific craigslist URL that the user builds (location, max price, type of item, etc.). It goes to the URL, scrapes the listing information (price, date posted, etc.) and returns three outputs. One is the 'x' number of items around the average price (the user determines the number of items and the price range, e.g. $100 below the average price). Next are the 'x' closest listings based on the zip code the user provided at the beginning (the user also determines how many items are displayed based on proximity to the zip code). Lastly, the craigslist URL is output to the user so they can visit the page and look at the items that were just shown to them. The scraped data is stored in a data.json file and a data.csv file; the content is the same, only the format differs. Every time a scrape finishes, I would like to offload this data into a database, either Cloud Firestore or AWS DynamoDB, because I want to host this as a web app in the future.
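For that database hand-off, here is a minimal sketch assuming Cloud Firestore via the google-cloud-firestore client; the collection name "listings" and the upload_listings helper are placeholders, not part of the script yet. It takes the same list of dictionaries that currently goes to data.json / data.csv.

# Hedged sketch: push one finished scrape to Cloud Firestore.
# Assumes google-cloud-firestore is installed and credentials are configured
# (GOOGLE_APPLICATION_CREDENTIALS); "listings" is an arbitrary collection name.
from google.cloud import firestore

def upload_listings(list_of_attributes):
    db = firestore.Client()
    for item in list_of_attributes:
        # Each listing dict ({'Listing': ..., 'Price': ..., ...}) becomes one document
        db.collection("listings").add(item)

DynamoDB would look much the same with boto3 (Table("listings").put_item(Item=item)).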
What I want to do is allow the user to have multiple instances of the same script, all running simultaneously with unique craigslist URLs. All of the code is identical; the only difference is the craigslist URL each script scrapes.
I made a method that iterates through building the attributes (location, max price, etc.) and returns the finished URL, but in my main I call the constructor, which needs all of those attributes, so I would have to fish them back out of the URL, which seems backwards.
I then tried adding a loop in my main: the user determines how many URLs they want to create, and the finished URLs are appended to a list. I ran into the same problem again.
import json

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class CraigslistScraper(object):
    # Constructor of the URL that is being scraped
def __init__(self, location, postal_code, max_price, query, radius):
self.location = location # Location(i.e. City) being searched
self.postal_code = postal_code # Postal code of location being searched
self.max_price = max_price # Max price of the items that will be searched
        self.query = query # The type of item being searched for
self.radius = radius # Radius of the area searched derived from the postal code given previously
self.url = f"https://{location}.craigslist.org/search/sss?&max_price={max_price}&postal={postal_code}&query={query}&20card&search_distance={radius}"
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver") # Path to the Chrome web driver
self.delay = 7 # The delay the driver gives when loading the web page
# Load up the web page
# Gets all relevant data on the page
# Goes to next page until we are at the last page
def load_craigslist_url(self):
data = []
# url_list = []
self.driver.get(self.url)
while True:
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID, "searchform")))
data.append(self.extract_post_titles())
# url_list.append(self.extract_post_urls())
WebDriverWait(self.driver, 2).until(
EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
            except Exception: # no clickable "next" link or the page timed out: we are on the last page
break
return data
# Extracts all relevant information from the web-page and returns them as individual lists
def extract_post_titles(self):
all_posts = self.driver.find_elements_by_class_name("result-row")
dates_list = []
titles_list = []
prices_list = []
distance_list = []
        for post in all_posts:
            # Split the row text on "$" to separate the price from the rest of the post
            title = post.text.split("$")
            if title[0] == '':
                title = title[1] # text started with "$": keep the part after it
            else:
                title = title[0]
            title = title.split("\n")
            price = title[0] # first line of that chunk is the price
            title = title[-1] # last line contains the posting date and the title
            title = title.split(" ")
            month = title[0]
            day = title[1]
            title = ' '.join(title[2:]) # everything after "month day" is the listing title
            date = month + " " + day
            if not price[:1].isdigit():
                price = "0" # listing shows no price
            price = int(price) # convert the price string to an integer
raw_distance = post.find_element_by_class_name(
'maptag').text
distance = raw_distance[:-2]
titles_list.append(title)
prices_list.append(price)
dates_list.append(date)
distance_list.append(distance)
return titles_list, prices_list, dates_list, distance_list
# Gets all of the url links of each listing on the page
# def extract_post_urls(self):
# soup_list = []
# html_page = urllib.request.urlopen(self.driver.current_url)
# soup = BeautifulSoup(html_page, "html.parser")
# for link in soup.findAll("a", {"class": "result-title hdrlnk"}):
# soup_list.append(link["href"])
#
# return soup_list
# Kills browser
def kill(self):
self.driver.close()
# Gets price value from dictionary and computes average
@staticmethod
def get_average(sample_dict):
price = list(map(lambda x: x['Price'], sample_dict))
sum_of_prices = sum(price)
length_of_list = len(price)
average = round(sum_of_prices / length_of_list)
return average
# Displays items around the average price of all the items in prices_list
@staticmethod
def get_items_around_average(avg, sample_dict, counter, give):
print("Items around average price: ")
print("-------------------------------------------")
raw_list = []
for z in range(len(sample_dict)):
current_price = sample_dict[z].get('Price')
if abs(current_price - avg) <= give:
raw_list.append(sample_dict[z])
final_list = raw_list[:counter]
for index in range(len(final_list)):
print('\n')
for key in final_list[index]:
print(key, ':', final_list[index][key])
# Displays nearest items to the zip provided
@staticmethod
def get_items_around_zip(sample_dict, counter):
final_list = []
print('\n')
print("Closest listings: ")
print("-------------------------------------------")
x = 0
while x < counter:
final_list.append(sample_dict[x])
x += 1
for index in range(len(final_list)):
print('\n')
for key in final_list[index]:
print(key, ':', final_list[index][key])
# Converts all_of_the_data list of dictionaries to json file
@staticmethod
def convert_to_json(sample_list):
with open(r"C:\Users\diego\development\WebScraper\data.json", 'w') as file_out:
file_out.write(json.dumps(sample_list, indent=4))
@staticmethod
def convert_to_csv(sample_list):
df = pd.DataFrame(sample_list)
df.to_csv("data.csv", index=False, header=True)
# Main where the big list data is broken down to its individual parts to be converted to a .csv file
if __name__ == "__main__":
location = input("Enter the location you would like to search: ") # Location Craigslist searches
zip_code = input(
"Enter the zip code you would like to base radius off of: ") # Postal code Craigslist uses as a base for 'MILES FROM ZIP'
type_of_item = input(
"Enter the item you would like to search (ex. furniture, bicycles, cars, etc.): ") # Type of item you are looking for
max_price = input(
"Enter the max price you would like the search to use: ") # Max price Craigslist limits the items too
radius = input(
"Enter the radius you would like the search to use (based off of zip code provided earlier): ") # Radius from postal code Craigslist limits the search to
scraper = CraigslistScraper(location, zip_code, max_price, type_of_item,
radius) # Constructs the URL with the given parameters
    results = scraper.load_craigslist_url() # Inserts the result of the scraping into a large multidimensional list
titles_list = results[0][0]
prices_list = list(map(int, results[0][1]))
dates_list = results[0][2]
distance_list = list(map(float, results[0][3]))
scraper.kill()
# Merge all of the lists into a dictionary
# Dictionary is then sorted by distance from smallest -> largest
list_of_attributes = []
for i in range(len(titles_list)):
content = {'Listing': titles_list[i], 'Price': prices_list[i], 'Date posted': dates_list[i],
'Distance from zip': distance_list[i]}
list_of_attributes.append(content)
list_of_attributes.sort(key=lambda x: x['Distance from zip'])
scraper.convert_to_json(list_of_attributes)
scraper.convert_to_csv(list_of_attributes)
# scraper.export_to_mongodb()
# Below function calls:
# Get average price and prints it
# Gets/prints listings around said average price
# Gets/prints nearest listings
average = scraper.get_average(list_of_attributes)
print(f'Average price of items searched: ${average}')
num_items_around_average = int(input("How many listings around the average price would you like to see?: "))
avg_range = int(input("Range of listings around the average price: "))
scraper.get_items_around_average(average, list_of_attributes, num_items_around_average, avg_range)
print("\n")
num_items = int(input("How many items would you like to display based off of proximity to zip code?: "))
print(f"Items around you: ")
scraper.get_items_around_zip(list_of_attributes, num_items)
print("\n")
print(f"Link of listings : {scraper.url}")
What I want is for the program to get the number of URLs the user wants to scrape. That input would determine how many instances of this script need to run.
The user would then go through the prompts to build each URL, for example input("What location would you like to search?: "). Once they finish creating the URLs, each scraper would run with its specific URL and display back the three outputs described above for that scraper's URL.
In the future I would like to add a timing function, where the user determines how often they want the script to run (every hour, every day, every other day, etc.), connect it to the database, and then just query the database for the 'x' number of listings around the average price and the 'x' closest listings by proximity for a specific URL's results.
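A minimal sketch of that timing idea, using only the standard library; the interval value and the run_once() wrapper are assumptions, not something the script has yet:

import time

RUN_INTERVAL_SECONDS = 24 * 60 * 60 # user-chosen: hourly, daily, every other day, ...

def run_once():
    # hypothetical wrapper: run the scraper(s), then push the results to the database
    pass

while True:
    run_once()
    time.sleep(RUN_INTERVAL_SECONDS) # sleep until the next scheduled scrape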
Answer (score: 0):
If you want to use multiple scraper instances in parallel while your main loop runs, you will need to use subprocesses.
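A minimal sketch of that idea with the standard-library multiprocessing module, assuming the CraigslistScraper class above. collect_params and run_scraper are placeholder helpers, and the prompts are gathered up front in the parent process because input() does not mix well with child processes.

from multiprocessing import Process

def collect_params():
    # Ask the same questions the current main asks, once per scraper
    location = input("Enter the location you would like to search: ")
    zip_code = input("Enter the zip code you would like to base radius off of: ")
    type_of_item = input("Enter the item you would like to search: ")
    max_price = input("Enter the max price you would like the search to use: ")
    radius = input("Enter the radius you would like the search to use: ")
    return location, zip_code, max_price, type_of_item, radius

def run_scraper(params):
    # Each process builds its own scraper, and therefore its own Chrome driver
    scraper = CraigslistScraper(*params)
    results = scraper.load_craigslist_url()
    scraper.kill()
    # ... build list_of_attributes from results, print the three outputs, write JSON/CSV ...

if __name__ == "__main__":
    num_scrapers = int(input("How many craigslist URLs would you like to scrape?: "))
    all_params = [collect_params() for _ in range(num_scrapers)]
    processes = [Process(target=run_scraper, args=(p,)) for p in all_params]
    for proc in processes:
        proc.start() # every scraper runs in its own child process
    for proc in processes:
        proc.join() # wait for all of them to finish

Console output from the processes will interleave, so once the database is in place it will be cleaner to write each scraper's results there and read them back per URL.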