So I have this Python script that scrapes the listings from a specific craigslist URL that the user builds (location, max price, type of item, etc.). It goes to the URL, scrapes the listing information (price, date posted, etc.) and returns three outputs. One is the 'x' number of items around the average price (the user determines the number of items and the price range, e.g. $100 below the average price). Next are the 'x' closest listings based on the zip code the user provided at the beginning (the user also determines how many items are displayed based on proximity to the zip code). Lastly, the craigslist URL is output to the user so they can visit the page and look at the items that were just shown to them. The scraped data is stored in a data.json file and a data.csv file; the content is the same, only the format differs. Every time a scrape finishes, I would like to offload this data into a database, either Cloud Firestore or AWS DynamoDB, because I want to host this as a web app in the future.
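For that database hand-off, here is a minimal sketch assuming Cloud Firestore via the google-cloud-firestore client; the collection name "listings" and the upload_listings helper are placeholders, not part of the script yet. It takes the same list of dictionaries that currently goes to data.json / data.csv.

# Hedged sketch: push one finished scrape to Cloud Firestore.
# Assumes google-cloud-firestore is installed and credentials are configured
# (GOOGLE_APPLICATION_CREDENTIALS); "listings" is an arbitrary collection name.
from google.cloud import firestore

def upload_listings(list_of_attributes):
    db = firestore.Client()
    for item in list_of_attributes:
        # Each listing dict ({'Listing': ..., 'Price': ..., ...}) becomes one document
        db.collection("listings").add(item)

DynamoDB would look much the same with boto3 (Table("listings").put_item(Item=item)).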
What I want to do is allow the user to have multiple instances of the same script, all running simultaneously with unique craigslist URLs. All of the code is identical; the only difference is the craigslist URL each script scrapes.
I made a method that iterates through building the attributes (location, max price, etc.) and returns the finished URL, but in my main I call the constructor, which needs all of those attributes, so I would have to fish them back out of the URL, which seems backwards.
I then tried adding a loop in my main: the user determines how many URLs they want to create, and the finished URLs are appended to a list. I ran into the same problem again.
import json

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class CraigslistScraper(object):
    # Constructor of the URL that is being scraped
def __init__(self, location, postal_code, max_price, query, radius):
self.location = location # Location(i.e. City) being searched
self.postal_code = postal_code # Postal code of location being searched
self.max_price = max_price # Max price of the items that will be searched
        self.query = query # The type of item being searched for
self.radius = radius # Radius of the area searched derived from the postal code given previously
self.url = f"https://{location}.craigslist.org/search/sss?&max_price={max_price}&postal={postal_code}&query={query}&20card&search_distance={radius}"
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver") # Path to the Chrome web driver
self.delay = 7 # The delay the driver gives when loading the web page
# Load up the web page
# Gets all relevant data on the page
# Goes to next page until we are at the last page
def load_craigslist_url(self):
data = []
# url_list = []
self.driver.get(self.url)
while True:
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID, "searchform")))
data.append(self.extract_post_titles())
# url_list.append(self.extract_post_urls())
WebDriverWait(self.driver, 2).until(
EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
            except Exception: # no clickable "next" link or the page timed out: we are on the last page
break
return data
# Extracts all relevant information from the web-page and returns them as individual lists
def extract_post_titles(self):
all_posts = self.driver.find_elements_by_class_name("result-row")
dates_list = []
titles_list = []
prices_list = []
distance_list = []
        for post in all_posts:
            # Split the row text on "$" to separate the price from the rest of the post
            title = post.text.split("$")
            if title[0] == '':
                title = title[1] # text started with "$": keep the part after it
            else:
                title = title[0]
            title = title.split("\n")
            price = title[0] # first line of that chunk is the price
            title = title[-1] # last line contains the posting date and the title
            title = title.split(" ")
            month = title[0]
            day = title[1]
            title = ' '.join(title[2:]) # everything after "month day" is the listing title
            date = month + " " + day
            if not price[:1].isdigit():
                price = "0" # listing shows no price
            price = int(price) # convert the price string to an integer
raw_distance = post.find_element_by_class_name(
'maptag').text
distance = raw_distance[:-2]
titles_list.append(title)
prices_list.append(price)
dates_list.append(date)
distance_list.append(distance)
return titles_list, prices_list, dates_list, distance_list
# Gets all of the url links of each listing on the page
# def extract_post_urls(self):
# soup_list = []
# html_page = urllib.request.urlopen(self.driver.current_url)
# soup = BeautifulSoup(html_page, "html.parser")
# for link in soup.findAll("a", {"class": "result-title hdrlnk"}):
# soup_list.append(link["href"])
#
# return soup_list
# Kills browser
def kill(self):
self.driver.close()
# Gets price value from dictionary and computes average
@staticmethod
def get_average(sample_dict):
price = list(map(lambda x: x['Price'], sample_dict))
sum_of_prices = sum(price)
length_of_list = len(price)
average = round(sum_of_prices / length_of_list)
return average
# Displays items around the average price of all the items in prices_list
@staticmethod
def get_items_around_average(avg, sample_dict, counter, give):
print("Items around average price: ")
print("-------------------------------------------")
raw_list = []
for z in range(len(sample_dict)):
current_price = sample_dict[z].get('Price')
if abs(current_price - avg) <= give:
raw_list.append(sample_dict[z])
final_list = raw_list[:counter]
for index in range(len(final_list)):
print('\n')
for key in final_list[index]:
print(key, ':', final_list[index][key])
# Displays nearest items to the zip provided
@staticmethod
def get_items_around_zip(sample_dict, counter):
final_list = []
print('\n')
print("Closest listings: ")
print("-------------------------------------------")
x = 0
while x < counter:
final_list.append(sample_dict[x])
x += 1
for index in range(len(final_list)):
print('\n')
for key in final_list[index]:
print(key, ':', final_list[index][key])
# Converts all_of_the_data list of dictionaries to json file
@staticmethod
def convert_to_json(sample_list):
with open(r"C:\Users\diego\development\WebScraper\data.json", 'w') as file_out:
file_out.write(json.dumps(sample_list, indent=4))
@staticmethod
def convert_to_csv(sample_list):
df = pd.DataFrame(sample_list)
df.to_csv("data.csv", index=False, header=True)
# Main where the big list data is broken down to its individual parts to be converted to a .csv file
if __name__ == "__main__":
location = input("Enter the location you would like to search: ") # Location Craigslist searches
zip_code = input(
"Enter the zip code you would like to base radius off of: ") # Postal code Craigslist uses as a base for 'MILES FROM ZIP'
type_of_item = input(
"Enter the item you would like to search (ex. furniture, bicycles, cars, etc.): ") # Type of item you are looking for
max_price = input(
"Enter the max price you would like the search to use: ") # Max price Craigslist limits the items too
radius = input(
"Enter the radius you would like the search to use (based off of zip code provided earlier): ") # Radius from postal code Craigslist limits the search to
scraper = CraigslistScraper(location, zip_code, max_price, type_of_item,
radius) # Constructs the URL with the given parameters
    results = scraper.load_craigslist_url() # Inserts the result of the scraping into a large multidimensional list
titles_list = results[0][0]
prices_list = list(map(int, results[0][1]))
dates_list = results[0][2]
distance_list = list(map(float, results[0][3]))
scraper.kill()
# Merge all of the lists into a dictionary
# Dictionary is then sorted by distance from smallest -> largest
list_of_attributes = []
for i in range(len(titles_list)):
content = {'Listing': titles_list[i], 'Price': prices_list[i], 'Date posted': dates_list[i],
'Distance from zip': distance_list[i]}
list_of_attributes.append(content)
list_of_attributes.sort(key=lambda x: x['Distance from zip'])
scraper.convert_to_json(list_of_attributes)
scraper.convert_to_csv(list_of_attributes)
# scraper.export_to_mongodb()
# Below function calls:
# Get average price and prints it
# Gets/prints listings around said average price
# Gets/prints nearest listings
average = scraper.get_average(list_of_attributes)
print(f'Average price of items searched: ${average}')
num_items_around_average = int(input("How many listings around the average price would you like to see?: "))
avg_range = int(input("Range of listings around the average price: "))
scraper.get_items_around_average(average, list_of_attributes, num_items_around_average, avg_range)
print("\n")
num_items = int(input("How many items would you like to display based off of proximity to zip code?: "))
print(f"Items around you: ")
scraper.get_items_around_zip(list_of_attributes, num_items)
print("\n")
print(f"Link of listings : {scraper.url}")
What I want is for the program to get the number of URLs the user wants to scrape. That input would determine how many instances of this script need to run.
The user would then go through the prompts to build each URL, for example input("What location would you like to search?: "). Once they finish creating the URLs, each scraper would run with its specific URL and display back the three outputs described above for that scraper's URL.
In the future I would like to add a timing function, where the user determines how often they want the script to run (every hour, every day, every other day, etc.), connect it to the database, and then just query the database for the 'x' number of listings around the average price and the 'x' closest listings by proximity for a specific URL's results.
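A minimal sketch of that timing idea, using only the standard library; the interval value and the run_once() wrapper are assumptions, not something the script has yet:

import time

RUN_INTERVAL_SECONDS = 24 * 60 * 60 # user-chosen: hourly, daily, every other day, ...

def run_once():
    # hypothetical wrapper: run the scraper(s), then push the results to the database
    pass

while True:
    run_once()
    time.sleep(RUN_INTERVAL_SECONDS) # sleep until the next scheduled scrape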
Answer (score: 0):
If you want to use multiple scraper instances in parallel while your main loop runs, you will need to use subprocesses.
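A minimal sketch of that idea with the standard-library multiprocessing module, assuming the CraigslistScraper class above. collect_params and run_scraper are placeholder helpers, and the prompts are gathered up front in the parent process because input() does not mix well with child processes.

from multiprocessing import Process

def collect_params():
    # Ask the same questions the current main asks, once per scraper
    location = input("Enter the location you would like to search: ")
    zip_code = input("Enter the zip code you would like to base radius off of: ")
    type_of_item = input("Enter the item you would like to search: ")
    max_price = input("Enter the max price you would like the search to use: ")
    radius = input("Enter the radius you would like the search to use: ")
    return location, zip_code, max_price, type_of_item, radius

def run_scraper(params):
    # Each process builds its own scraper, and therefore its own Chrome driver
    scraper = CraigslistScraper(*params)
    results = scraper.load_craigslist_url()
    scraper.kill()
    # ... build list_of_attributes from results, print the three outputs, write JSON/CSV ...

if __name__ == "__main__":
    num_scrapers = int(input("How many craigslist URLs would you like to scrape?: "))
    all_params = [collect_params() for _ in range(num_scrapers)]
    processes = [Process(target=run_scraper, args=(p,)) for p in all_params]
    for proc in processes:
        proc.start() # every scraper runs in its own child process
    for proc in processes:
        proc.join() # wait for all of them to finish

Console output from the processes will interleave, so once the database is in place it will be cleaner to write each scraper's results there and read them back per URL.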