我正在用 Python 编写一个基于类的爬虫,但写到一半卡住了。我不知道如何把新生成的链接(由 app_crawler 类生成)传递给 App 类,以便在那里完成剩下的工作。如果有人能演示如何让它运行起来、为我指明正确的方向,我将不胜感激。提前致谢。顺便说一句,这段代码可以运行,但只对单个链接有效。
from lxml import html
import requests
class app_crawler:
    """Collect app links from the page at ``starturl``.

    ``crawler()`` stores every href it finds on ``self.links`` so that a
    subclass (such as ``App``) can process them afterwards.
    """

    # Page the crawl starts from.
    starturl = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8"

    def crawler(self):
        """Scrape ``starturl`` and keep all matched hrefs on ``self.links``.

        Any previous value of ``self.links`` is replaced.

        Returns:
            The list that was stored on ``self.links``.
        """
        # Keeping the result on the instance (instead of discarding it) is
        # what makes the links reachable from subclass methods.
        self.links = self.get_links(self.starturl)
        return self.links

    def get_links(self, link):
        """Fetch ``link`` and return every href matched by the app-link XPath.

        Args:
            link: URL of the page to download.

        Returns:
            The result of the XPath query (empty when nothing matches).
        """
        page = requests.get(link)
        tree = html.fromstring(page.text)
        return tree.xpath('//div[@class="lockup-info"]//*/a[@class="name"]/@href')

    def get_app(self, link):
        """Return the first href found on ``link``, or ``None`` if there is none.

        This keeps the method's original contract; use ``get_links()`` to
        obtain all hrefs rather than only the first one.
        """
        links = self.get_links(link)
        return links[0] if links else None
class App(app_crawler):
    """Print name, developer and price for every URL held in ``self.links``."""

    def __init__(self, link):
        """Start with ``self.links`` containing only ``link``.

        Args:
            link: URL of the first page to process.
        """
        self.links = [link]

    def process_links(self):
        """Call ``get_item`` on each URL in ``self.links``, in order."""
        for link in self.links:
            self.get_item(link)

    def get_item(self, url):
        """Download ``url`` and print the app's name, developer and price.

        Args:
            url: URL of the page to download.

        Raises:
            IndexError: if any of the three XPath queries matches nothing.
        """
        page = requests.get(url)
        tree = html.fromstring(page.text)
        # Each query yields a list of matches; only the first one is used.
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        print(name, developer, price)
if __name__ == '__main__':
    # Seed the scraper with the class-level start URL.
    parse = App(app_crawler.starturl)
    # Fetch the start page to look for further app links.
    parse.crawler()
    # Print the details of every URL currently held in parse.links.
    parse.process_links()
我已经写了另一个可以正常工作的版本,但我希望上面的爬虫采用不同的写法。以下是可运行版本的链接:“https://www.dropbox.com/s/galjorcdynueequ/Working%20one.txt?dl=0”
答案 0 :(得分:2)
您的代码存在以下问题:
`App` 继承自 `app_crawler`,但您却向 `App.__init__` 传入了一个 `app_crawler` 实例。
`App.__init__` 直接调用了 `app_crawler.__init__`,而不是 `super().__init__()`。
`app_crawler.get_app` 不仅实际上没有返回任何内容,还会创建一个全新的 `App` 对象。
这会导致您的代码把 `app_crawler` 对象(而不是 url 字符串)传给 `requests.get`。
您的代码中有太多封装。
请参考以下代码:它比您那段无法工作的代码更短、更简洁,也无需不必要地来回传递对象:
from lxml import html
import requests
class App:
    """Scrape app links from a start page and print details for each of them."""

    def __init__(self, starturl):
        """Remember the start URL and begin with no collected links.

        Args:
            starturl: URL of the page whose app links will be collected.
        """
        self.starturl = starturl
        self.links = []

    def get_links(self):
        """Download ``self.starturl`` and store the matched hrefs on ``self.links``."""
        page = requests.get(self.starturl)
        tree = html.fromstring(page.text)
        self.links = tree.xpath('//div[@class="lockup-info"]//*/a[@class="name"]/@href')

    def process_links(self):
        """Call ``get_docs`` on each URL in ``self.links``, in order."""
        for link in self.links:
            self.get_docs(link)

    def get_docs(self, url):
        """Download ``url`` and print the app's name, developer and price.

        Args:
            url: URL of the page to download.

        Raises:
            IndexError: if any of the three XPath queries matches nothing.
        """
        page = requests.get(url)
        tree = html.fromstring(page.text)
        # Each query yields a list of matches; only the first one is used.
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        print(name, developer, price)
if __name__ == '__main__':
    parse = App("https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8")
    # Collect the app links from the start page first ...
    parse.get_links()
    # ... then print the details of each collected link.
    parse.process_links()
输出
Cookie Jam By Jam City, Inc. Free
Zombie Tsunami By Mobigame Free
Flow Free By Big Duck Games LLC Free
Bejeweled Blitz By PopCap Free
Juice Jam By Jam City, Inc. Free
Candy Crush Soda Saga By King Free
Bubble Witch 3 Saga By King Free
Candy Crush Jelly Saga By King Free
Farm Heroes Saga By King Free
Pet Rescue Saga By King Free
答案 1 :(得分:0)
这就是我期望中代码应有的样子:
from lxml import html
import requests
class app_crawler:
    """Build a bounded list of page URLs, starting from ``starturl``.

    ``self.links`` begins as ``[starturl]`` and grows with hrefs scraped from
    pages already in the list until it holds ``max_links`` entries.
    """

    # Page the crawl starts from.
    starturl = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8"
    # Upper bound on len(self.links); override on a subclass or instance to
    # collect more or fewer links.
    max_links = 5

    def __init__(self):
        """Seed ``self.links`` with the start URL."""
        self.links = [self.starturl]

    def crawler(self):
        """Visit collected links in order until the list is full or exhausted.

        Stops as soon as ``self.links`` holds ``max_links`` entries, so no
        page is downloaded once nothing more could be appended.
        """
        # Index-based walk: get_app() appends to self.links while it is being
        # traversed, so the length is re-read on every pass.
        position = 0
        while position < len(self.links) and len(self.links) < self.max_links:
            self.get_app(self.links[position])
            position += 1

    def get_app(self, link):
        """Fetch ``link`` and append its hrefs to ``self.links`` up to the cap.

        Args:
            link: URL of the page to download.
        """
        page = requests.get(link)
        tree = html.fromstring(page.text)
        links = tree.xpath('//div[@class="lockup-info"]//*/a[@class="name"]/@href')
        # Only take as many hrefs as still fit under max_links.
        room = self.max_links - len(self.links)
        if room > 0:
            self.links.extend(links[:room])
class App(app_crawler):
    """Print name, developer and price for every URL held in ``self.links``."""

    def __init__(self):
        """Initialise the crawler state defined by the base class."""
        super().__init__()

    def process_links(self):
        """Call ``get_item`` on each URL in ``self.links``, in order."""
        for link in self.links:
            self.get_item(link)

    def get_item(self, url):
        """Download ``url`` and print the app's name, developer and price.

        Args:
            url: URL of the page to download.

        Raises:
            IndexError: if any of the three XPath queries matches nothing.
        """
        page = requests.get(url)
        tree = html.fromstring(page.text)
        # Each query yields a list of matches; only the first one is used.
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        print(name, developer, price)
if __name__ == '__main__':
    scrape = App()
    # Fill scrape.links first ...
    scrape.crawler()
    # ... then print the details of every collected link.
    scrape.process_links()