Python: extract data and append it to a dataframe

Time: 2018-07-02 11:37:36

Tags: python pandas loops dataframe export

I scraped a website for a research project, but I can't find the right way to extract the data into a dataframe. I believe my problem is related to the list objects between lines 36 and 38.

Printing the rows works perfectly fine, and I can see the final version of the data frame in the Python console.

The solution is probably very simple, but I haven't been able to figure it out. Thanks in advance for any help.

from time import sleep
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd

# Insert the highest page number for the website
highest_number = 12


def total_page_number(url):
    """Build the list of result-page URLs: the base url first, then the
    same url with '&page=N' appended for each remaining page."""
    all_webpage_links = [url]
    # range() is half-open, so highest_number + 1 is needed to include the last page
    pages = [str(each_number) for each_number in range(2, highest_number + 1)]
    for page in pages:
        all_webpage_links.append(url + '&page=' + page)
    return all_webpage_links
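# For example, with a hypothetical highest_number = 3, this would return
# [url, url + '&page=2', url + '&page=3']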


# Use total_page_number function to create page list for website
All_page = total_page_number(
    'https://www.imdb.com/search/title?countries=tr&languages=tr&locations=Turkey&count=250&view=simple')


def clean_text(text):
    """ Removes whitespace before and after the string, and collapses any
    run of whitespace between words to a single space
    :param text: the string to clean
    :return: a "cleaned" string with no more than one space between words
    """
    return ' '.join(text.split())
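# For illustration: str.split() with no argument splits on any whitespace,
# so clean_text(' Kemal   Sunal\n') returns 'Kemal Sunal'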


# Create list objects for data
# Problem occurs in this line !!!!!!
actor_names = []
titles = []
dates = []


def get_cast_from_link(movie_link):
    """ Go to the IMDb movie page in link, and find the cast overview list.
        Appends each actor to the module-level lists and prints
        tab-separated movie_title, actor, and movie_date to stdout.
        Nothing returned.
    :param movie_link: string of the link to an IMDb movie page (http://imdb.com
    ...)
    :return: void
    """
    movie_page = requests.get(movie_link)

    # Use SoupStrainer to strain the cast_list table from the movie_page
    # This can save some time in bigger scraping projects
    cast_strainer = SoupStrainer('table', class_='cast_list')
    movie_soup = BeautifulSoup(movie_page.content, 'html.parser', parse_only=cast_strainer)
    # Iterate through rows and extract the name and character
    # Remember that some rows might not be a row of interest (e.g., a blank
    # row for spacing the layout). Therefore, we need to use a try-except
    # block to make sure we capture only the rows we want, without python
    # complaining.
    for row in movie_soup.find_all('tr'):
        try:
            actor = clean_text(row.find(itemprop='name').text)
            actor_names.append(actor)
            titles.append(movie_title)
            dates.append(movie_date)
            print('\t'.join([movie_title, actor, movie_date]))
        except AttributeError:
            pass


# Export data frame
# Problem occurs in this line !!!!!!
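# Note: at this point in the script, the scraping loop further below has
# not run yet, so actor_names, titles, and dates are still empty when
# the DataFrame is built and written out here.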
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', encoding='utf-8')

for each in All_page:
    # Use requests.get('url') to load the page you want
    web_page = requests.get(each)
    # https://www.imdb.com/search/title?countries=tr&languages=tr&count=250&view=simple&page=2

    # Prepare the SoupStrainer to strain just the tbody containing the list of movies
    list_strainer = SoupStrainer('div', class_='lister-list')

    # Parse the html content of the web page with BeautifulSoup
    soup = BeautifulSoup(web_page.content, 'html.parser', parse_only=list_strainer)

    # Generate a list of the "Rank & Title" column of each row and iterate
    movie_list = soup.find_all('span', class_='lister-item-header')
    for movie in movie_list:
        movie_title = movie.a.text
        movie_date = movie.find('span', class_='lister-item-year text-muted unbold').text

        # get the link to the movie's own IMDb page, and follow it
        link = 'http://imdb.com' + movie.a.get('href')

        get_cast_from_link(link)
        # remember to be nice, and sleep a while between requests!
        sleep(15)
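
One likely cause, judging from the ordering above: the DataFrame is constructed and exported at module level before the scraping loop has run, so actor_names, titles, and dates are all still empty when to_csv is called. A minimal sketch of the fix, under that assumption, is to build and export the DataFrame only after the loop finishes (passing sep='\t' as well, since to_csv writes comma-separated output by default even when the filename ends in .tsv):

# After the page loop above has finished, the three lists are populated,
# so build and export the DataFrame only now
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
# sep='\t' makes the output genuinely tab-separated, matching the .tsv name
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', sep='\t', encoding='utf-8')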

0 Answers:

No answers