Question

我是一个爬虫新手！

我目前被一项繁琐无聊的任务困住了：我必须从 AngelList 上复制/粘贴某些内容并将其保存到 Excel 中。我以前用过爬虫工具来自动执行这类无聊的任务，但这一次比较棘手，我找不到自动化的方法。网站链接如下：

https://angel.co/people/all

请使用过滤器：位置 -> 美国，市场 -> 在线约会。将有大约550个结果（请注意，当您应用过滤器时，URL不会发生变化）

应用过滤器后，我已成功抓取了所有个人资料的网址。因此，我有一个包含这550个个人资料URL的Excel文件。

现在，下一步是打开每个个人资料页面并抓取某些信息。我目前需要以下字段：

名称
描述信息
投资
创始人
顾问
位置
市场
我正在寻找什么

现在我已经尝试了很多解决方案但到目前为止还没有解决过。 Import.io，数据挖掘器，数据抓取工具对我没什么帮助。

请建议是否有任何VBA代码或Python代码或任何可以帮助我自动执行此抓取任务的工具？

完整的解决方案代码：

以下是带注释的最终代码。如果有人仍有问题，请在下方发表评论，我会尽力帮助您。

from bs4 import BeautifulSoup
import urllib2
import json
import csv

def fetch_page(url, timeout=None):
    """Download *url* with a browser-like User-Agent and return the body.

    Args:
        url: Address of the page to download.
        timeout: Optional socket timeout in seconds. When left as None the
            request is opened exactly as before, without an explicit timeout.

    Returns:
        Whatever the response object's read() returns for the full body.

    Raises:
        Any exception raised by the opener or by reading the response is
        propagated unchanged to the caller.
    """
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/43.0.1')]
    if timeout is None:
        response = opener.open(url)
    else:
        response = opener.open(url, timeout=timeout)
    # Close the connection explicitly: this function is called once per
    # profile, so relying on garbage collection would leak sockets.
    try:
        return response.read()
    finally:
        response.close()


# Scrape every profile URL listed in 'profiles_links.csv' and write one CSV
# row per profile to 'angle_profiles.csv'.

# Row Headers
CSV_HEADER = ["URL", "Name", "Founder", "Advisor", "Employee", "Board Member",
              "Customer", "Locations", "Markets", "Investments",
              "What_iam_looking_for"]


def _texts(nodes):
    """Return the stripped text of each BeautifulSoup node in *nodes*."""
    return [node.get_text().strip() for node in nodes]


def _tag_texts(soup, field):
    """Return the span texts inside the div whose data-field is *field*."""
    wrapper = soup.find('div', attrs={'data-field': field})
    if wrapper is None:
        return []
    return [span.get_text().strip() for span in wrapper.find_all('span')]


def _investments(soup):
    """Return company names from the profile's separate investments feed."""
    holders = soup.select('.profiles-show .profiles-show')
    if not holders or not holders[0].has_attr('data-user_id'):
        return []
    user_id = holders[0]['data-user_id']
    # investments are loaded using separate request and response is in JSON format
    json_data = fetch_page("https://angel.co/startup_roles/investments?user_id=%s" % user_id)
    return [record['company']['company_name'] for record in json.loads(json_data)]


def _cell(value):
    """Turn a text, or a list of texts, into one UTF-8 encoded CSV cell."""
    if isinstance(value, (list, tuple)):
        # Skip empty/None entries so the join cannot fail on missing names.
        value = u'; '.join(item for item in value if item)
    # Python 2's csv module works on byte strings, so encode explicitly
    # instead of calling str(), which fails on non-ASCII text.
    return value.encode('utf-8')


# Binary mode is what Python 2's csv module expects; the with-statement
# guarantees both files are closed even if a profile fails half-way.
with open('angle_profiles.csv', 'wb') as f, open("profiles_links.csv") as f2:
    # csv.writer quotes commas itself, so values no longer need to have
    # their commas replaced to keep the columns aligned.
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(CSV_HEADER)

    index = 0
    for row in f2:
        url = row.strip()
        if not url:
            # Blank lines in the URL list are not profiles.
            continue
        index += 1
        print("@ Index:  %d" % index)

        # Skip URLs that cannot be downloaded and continue with the rest.
        # Only one request is made, always with the custom User-Agent.
        try:
            html = fetch_page(url)
        except Exception as e:
            # Report the real failure; not every error is a 404.
            print("Error fetching %s: %s" % (url, e))
            continue

        bs = BeautifulSoup(html, "html.parser")

        # Extract info from page with these tags..
        headings = bs.select(".profile-text h1")
        name = headings[0].get_text().strip() if headings else u''

        # NOTE: the bio/description field is not exported; the header row
        # has no column for it.
        founder = _texts(bs.select('.role_founder a'))
        advisor = _texts(bs.select('.role_advisor a'))
        employee = _texts(bs.select('.role_employee a'))
        board_member = _texts(bs.select('.role_board_member a'))
        customer = _texts(bs.select('.role_customer a'))

        locations = _tag_texts(bs, 'tags_interested_locations')
        markets = _tag_texts(bs, 'tags_interested_markets')

        what_iam_looking_for = u' '.join(_texts(bs.select('div.criteria p')))

        # A failing investments request should not discard the rest of the
        # profile, so fall back to an empty list and say why.
        try:
            investments = _investments(bs)
        except Exception as e:
            print("Could not load investments for %s: %s" % (url, e))
            investments = []

        # Write the information back to the file, one row per profile.
        fields = (name, founder, advisor, employee, board_member, customer,
                  locations, markets, investments, what_iam_looking_for)
        writer.writerow([url] + [_cell(field) for field in fields])

可以使用以下任意链接测试上述代码：

https://angel.co/idg-ventures?utm_source=people
https://angel.co/douglas-feirstein?utm_source=people
https://angel.co/andrew-heckler?utm_source=people
https://angel.co/mvklein?utm_source=people
https://angel.co/rajs1?utm_source=people

快乐编码：）

Answer 1

要使用我的方案，您需要先通过pip或easy_install安装BeautifulSoup

from bs4 import BeautifulSoup
import urllib2
import json

def fetch_page(url, timeout=None):
    """Download *url* with a browser-like User-Agent and return the body.

    Args:
        url: Address of the page to download.
        timeout: Optional socket timeout in seconds. When left as None the
            request is opened exactly as before, without an explicit timeout.

    Returns:
        Whatever the response object's read() returns for the full body.

    Raises:
        Any exception raised by the opener or by reading the response is
        propagated unchanged to the caller.
    """
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    if timeout is None:
        response = opener.open(url)
    else:
        response = opener.open(url, timeout=timeout)
    # Close the connection explicitly rather than relying on garbage
    # collection to release the socket.
    try:
        return response.read()
    finally:
        response.close()


# Download one profile page and pull the interesting fields out of it.
html = fetch_page("https://angel.co/davidtisch")

# or load from local file
#html = open('page.html', 'r').read()

bs = BeautifulSoup(html, "html.parser")


def stripped_texts(css):
    """Return the stripped text of every element matching selector *css*."""
    return [element.get_text().strip() for element in bs.select(css)]


# The first heading inside the profile text block holds the display name.
heading = bs.select(".profile-text h1")[0]
name = heading.get_text().strip()

# The bio is read from the data-value attribute of its div.
bio_block = bs.select('div[data-field="bio"]')[0]
description = bio_block['data-value']

founder = stripped_texts('.role_founder a')

advisor = stripped_texts('.role_advisor a')

locations = stripped_texts('div[data-field="tags_interested_locations"] a')

markets = stripped_texts('div[data-field="tags_interested_markets"] a')

criteria_paragraphs = stripped_texts('div.criteria p')
what_iam_looking_for = ' '.join(criteria_paragraphs)

profile_block = bs.select('.profiles-show .profiles-show')[0]
user_id = profile_block['data-user_id']

# investments are loaded using separate request and response is in JSON format
investments_url = "https://angel.co/startup_roles/investments?user_id=%s" % user_id
json_data = fetch_page(investments_url)

investment_records = json.loads(json_data)

investments = [record['company']['company_name'] for record in investment_records]

Answer 2

查看https://scrapy.org/

它允许非常快速地编写解析器。这是我的一个网站的示例解析器，类似于angel.co：https://gist.github.com/lisitsky/c4aac52edcb7abfd5975be067face1bb

很遗憾，我现在无法访问angel.co。可以从下面这个示例入手：

$ pip install scrapy
$ cat > myspider.py <<EOF

import scrapy

class BlogSpider(scrapy.Spider):
    """Example spider that yields heading link texts and follows one link per page."""

    name = 'blogspider'
    start_urls = ['https://angel.co']

    def parse(self, response):
        """Yield a title item per matched heading, then request the next page.

        The next page is the href of the link directly under div.prev-post;
        when no such link is found the crawl of this branch stops.
        """
        # here's selector to extract interesting elements
        headings = response.css('h2.entry-title')
        for heading in headings:
            # write down here values you'd like to extract from the element
            link_text = heading.css('a ::text').extract_first()
            yield {'title': link_text}

        # how to find next page
        next_href = response.css('div.prev-post > a ::attr(href)').extract_first()
        if not next_href:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

EOF

$ scrapy runspider myspider.py

填入您感兴趣的CSS选择器，然后运行spider。

使用Python BeautifulSoup抓取AngelList个人资料描述

2 个答案: