from bs4 import BeautifulSoup
import requests
import pandas as pd
records = []
keep_looping = True
url = 'https://reelgood.com/source/netflix'
while keep_looping:
r = requests.get(url)
soup = BeautifulSoup(r.text,'html.parser')
title = soup.find_all('tr',attrs={'class':'cM'})
for t in title:
movie = t.find(attrs={'class':'cI'}).text
year = t.find(attrs={'class':'cJ'}).findNext('td').text
rating = t.find(attrs={'class':'cJ'}).findNext('td').findNext('td').text
score = t.find(attrs={'class':'cJ'}).findNext('td').findNext('td').findNext('td').text
rottenTomatoe = t.find(attrs={'class':'cJ'}).findNext('td').findNext('td').findNext('td').findNext('td').text
episodes = t.find(attrs={'class':'c0'}).text[:3]
records.append([movie, year, rating, score, rottenTomatoe, episodes])
url_tag = soup.find('a',attrs={'class':'eH'})
if not url_tag:
keep_looping = False
else:
url = "https://www.reelgood.com" + url_tag.get('href')
以上代码从我的网页获取了所有数据:https://reelgood.com/source/netflix
我想要这些数据然后我想要删除有关每部电影的其他数据,这些数据似乎是专门为每部电影存储的。这就是我想从中获取数据的网址:
https://reelgood.com/movie/thor-ragnarok-2017
(正如您所看到的,它将电影标题添加到网址中),但我想为每部电影获取此数据。我想从这个网页上抓取的数据是流派(动作和冒险,喜剧等)。
有关如何实现这一目标的任何想法?
感谢您的帮助,因为这是我的第一个独立项目网络报废,所以如果这是常识,我将从这里学习。
答案 0 :(得分:1)
首先,您可以从主要电影列表中找到网址,然后对每个网址进行迭代,以找到第一页上html
表中无法找到的其他信息:
import requests, re
from bs4 import BeautifulSoup as soup
from typing import NamedTuple
class Movie(NamedTuple):
year:int
name:str
imdb_rating:str
rotten_tomatoes_rating:str
tags:list
age:str
description:str
start = soup(requests.get('https://reelgood.com/source/netflix').text, 'html.parser')
links = [f"https://reelgood.com{i['href']}" for i in start.find_all('a') if re.findall('^/movie/', i['href'])]
new_results = [soup(requests.get(i).text, 'html.parser') for i in links]
extra = iter(filter(lambda x:re.findall('^\d+$', x), [i.text for i in start.find_all('td')]))
tags = [['h1', {'itemprop':'name'}, True], ['div', {'class':'eC'}, True], ['div', {'class':'eD'}, True], ['a', {'class':'eA'}, False], ['span', {'title':'Maturity rating'}, True], ['p', {'itemprop':'description'}, True]]
final_results = [[(lambda x:[getattr(h, 'text','N/A') for h in x] if isinstance(x, list) else getattr(x, 'text', 'N/A'))(getattr(b, ['find_all', 'find'][method])(tag, c)) for tag, c, method in tags] for b in new_results]
d = [Movie(*[int(next(extra)), *i]) for i in final_results]
输出:
[Movie(year=2017, name='Thor: Ragnarok', imdb_rating='7.9/10', rotten_tomatoes_rating='92%', tags=['Action & Adventure', 'Fantasy', 'Based on Comic', 'Sequel', 'Superhero'], age='13+', description='Thor is on the other side of the universe and finds himself in a race against time to get back to Asgard to stop Ragnarok, the prophecy of destruction to his homeworld and the end of Asgardian civilization, at the hands of an all-powerful new threat, the ruthless Hela.'), Movie(year=2010, name='Thor: Ragnarok', imdb_rating='7.9/10', rotten_tomatoes_rating='92%', tags=['Action & Adventure', 'Fantasy', 'Based on Comic', 'Sequel', 'Superhero'], age='13+', description='Thor is on the other side of the universe and finds himself in a race against time to get back to Asgard to stop Ragnarok, the prophecy of destruction to his homeworld and the end of Asgardian civilization, at the hands of an all-powerful new threat, the ruthless Hela.'), Movie(year=2017, name='Coco', imdb_rating='8.5/10', rotten_tomatoes_rating='97%', tags=['Animation', 'Action & Adventure', 'Music'], age='7+', description="Despite his family’s baffling generations-old ban on music, Miguel dreams of becoming an accomplished musician like his idol, Ernesto de la Cruz. Desperate to prove his talent, Miguel finds himself in the stunning and colorful Land of the Dead following a mysterious chain of events. Along the way, he meets charming trickster Hector, and together, they set off on an extraordinary journey to unlock the real story behind Miguel's family history."), Movie(year=2015, name='Coco', imdb_rating='8.5/10', rotten_tomatoes_rating='97%', tags=['Animation', 'Action & Adventure', 'Music'], age='7+', description="Despite his family’s baffling generations-old ban on music, Miguel dreams of becoming an accomplished musician like his idol, Ernesto de la Cruz. Desperate to prove his talent, Miguel finds himself in the stunning and colorful Land of the Dead following a mysterious chain of events. Along the way, he meets charming trickster Hector, and together, they set off on an extraordinary journey to unlock the real story behind Miguel's family history."), Movie(year=1999, name='Guardians of the Galaxy Vol. 2', imdb_rating='7.7/10', rotten_tomatoes_rating='83%', tags=['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Space', 'Superhero'], age='13+', description="The Guardians must fight to keep their newfound family together as they unravel the mysteries of Peter Quill's true parentage."), Movie(year=2014, name='Guardians of the Galaxy Vol. 2', imdb_rating='7.7/10', rotten_tomatoes_rating='83%', tags=['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Space', 'Superhero'], age='13+', description="The Guardians must fight to keep their newfound family together as they unravel the mysteries of Peter Quill's true parentage."), Movie(year=2005, name='Pirates of the Caribbean: Dead Men Tell No Tales', imdb_rating='6.6/10', rotten_tomatoes_rating='30%', tags=['Action & Adventure', 'Comedy', 'Ghost', 'Sequel'], age='13+', description="Thrust into an all-new adventure, a down-on-his-luck Capt. Jack Sparrow feels the winds of ill-fortune blowing even more strongly when deadly ghost sailors led by his old nemesis, the evil Capt. Salazar, escape from the Devil's Triangle. Jack's only hope of survival lies in seeking out the legendary Trident of Poseidon, but to find it, he must forge an uneasy alliance with a brilliant and beautiful astronomer and a headstrong young man in the British navy."), Movie(year=1999, name='Pirates of the Caribbean: Dead Men Tell No Tales', imdb_rating='6.6/10', rotten_tomatoes_rating='30%', tags=['Action & Adventure', 'Comedy', 'Ghost', 'Sequel'], age='13+', description="Thrust into an all-new adventure, a down-on-his-luck Capt. Jack Sparrow feels the winds of ill-fortune blowing even more strongly when deadly ghost sailors led by his old nemesis, the evil Capt. Salazar, escape from the Devil's Triangle. Jack's only hope of survival lies in seeking out the legendary Trident of Poseidon, but to find it, he must forge an uneasy alliance with a brilliant and beautiful astronomer and a headstrong young man in the British navy."), Movie(year=2005, name='Captain America: Civil War', imdb_rating='7.8/10', rotten_tomatoes_rating='91%', tags=['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Superhero', 'War'], age='13+', description='Following the events of Age of Ultron, the collective governments of the world pass an act designed to regulate all superhuman activity. This polarizes opinion amongst the Avengers, causing two factions to side with Iron Man or Captain America, which causes an epic battle between former allies.'), Movie(year=2017, name='Captain America: Civil War', imdb_rating='7.8/10', rotten_tomatoes_rating='91%', tags=['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Superhero', 'War'], age='13+', description='Following the events of Age of Ultron, the collective governments of the world pass an act designed to regulate all superhuman activity. This polarizes opinion amongst the Avengers, causing two factions to side with Iron Man or Captain America, which causes an epic battle between former allies.'), Movie(year=1994, name='Doctor Strange', imdb_rating='7.5/10', rotten_tomatoes_rating='89%', tags=['Action & Adventure', 'Fantasy', 'Based on Comic', 'Doctor', 'Magic', 'Superhero'], age='13+', description='After his career is destroyed, a brilliant but arrogant surgeon gets a new lease on life when a sorcerer takes him under his wing and trains him to defend the world against evil.'), Movie(year=2017, name='Doctor Strange', imdb_rating='7.5/10', rotten_tomatoes_rating='89%', tags=['Action & Adventure', 'Fantasy', 'Based on Comic', 'Doctor', 'Magic', 'Superhero'], age='13+', description='After his career is destroyed, a brilliant but arrogant surgeon gets a new lease on life when a sorcerer takes him under his wing and trains him to defend the world against evil.'), Movie(year=2012, name="Pirates of the Caribbean: Dead Man's Chest", imdb_rating='7.3/10', rotten_tomatoes_rating='54%', tags=['Action & Adventure', 'Fantasy', 'Monster'], age='13+', description='Captain Jack Sparrow works his way out of a blood debt with the ghostly Davey Jones, he also attempts to avoid eternal damnation.'), Movie(year=2013, name="Pirates of the Caribbean: Dead Man's Chest", imdb_rating='7.3/10', rotten_tomatoes_rating='54%', tags=['Action & Adventure', 'Fantasy', 'Monster'], age='13+', description='Captain Jack Sparrow works his way out of a blood debt with the ghostly Davey Jones, he also attempts to avoid eternal damnation.'), Movie(year=2013, name='The Imitation Game', imdb_rating='8/10', rotten_tomatoes_rating='90%', tags=['Biography', 'Gay & Lesbian', 'Biography', 'World War II'], age='13+', description="Based on the real life story of legendary cryptanalyst Alan Turing, the film portrays the nail-biting race against time by Turing and his brilliant team of code-breakers at Britain's top-secret Government Code and Cypher School at Bletchley Park, during the darkest days of World War II."), Movie(year=1974, name='The Imitation Game', imdb_rating='8/10', rotten_tomatoes_rating='90%', tags=['Biography', 'Gay & Lesbian', 'Biography', 'World War II'], age='13+', description="Based on the real life story of legendary cryptanalyst Alan Turing, the film portrays the nail-biting race against time by Turing and his brilliant team of code-breakers at Britain's top-secret Government Code and Cypher School at Bletchley Park, during the darkest days of World War II.")]
现在,可以找到与每部电影相关的标签:
print([i.tags for i in d])
输出:
[['Action & Adventure', 'Fantasy', 'Based on Comic', 'Sequel', 'Superhero'], ['Action & Adventure', 'Fantasy', 'Based on Comic', 'Sequel', 'Superhero'], ['Animation', 'Action & Adventure', 'Music'], ['Animation', 'Action & Adventure', 'Music'], ['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Space', 'Superhero'], ['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Space', 'Superhero'], ['Action & Adventure', 'Comedy', 'Ghost', 'Sequel'], ['Action & Adventure', 'Comedy', 'Ghost', 'Sequel'], ['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Superhero', 'War'], ['Action & Adventure', 'Science-Fiction', 'Based on Comic', 'Sequel', 'Superhero', 'War'], ['Action & Adventure', 'Fantasy', 'Based on Comic', 'Doctor', 'Magic', 'Superhero'], ['Action & Adventure', 'Fantasy', 'Based on Comic', 'Doctor', 'Magic', 'Superhero'], ['Action & Adventure', 'Fantasy', 'Monster'], ['Action & Adventure', 'Fantasy', 'Monster'], ['Biography', 'Gay & Lesbian', 'Biography', 'World War II'], ['Biography', 'Gay & Lesbian', 'Biography', 'World War II']]