Scrapy、网址列表、Python 与 pandas

时间:2017-02-25 18:43:01

标签: python csv scrapy

你好stackoverflow社区,请帮帮我,我的代码是:{

import scrapy  
import pandas as pd 
class QuotesSpider(scrapy.Spider):
    """Collect the profile counters of every Twitter URL listed in a CSV file.

    The URLs come from the ``twitter_url`` column of the organizations CSV,
    which is loaded once when the class is defined. Each scraped page is
    turned into one dict holding the page URL, the account name and one
    entry per label/value pair found in the profile navigation bar.
    """

    # Scrapy refuses to instantiate a spider that has no name.
    name = "popularity"

    organization = pd.read_csv("/home/jihane/Téléchargements/odm.csv/organizations.csv")
    data = organization.twitter_url.tolist()

    def start_requests(self):
        """Yield one request per usable URL taken from the CSV column."""
        # Class attributes are not visible as bare names inside a method;
        # they have to be reached through ``self``.
        for url in self.data:
            # pandas loads empty cells as float NaN, which scrapy.Request
            # rejects, so only non-blank strings are requested.
            if isinstance(url, str) and url.strip():
                yield scrapy.Request(url=url.strip(), callback=self.parse)

    def parse(self, response):
        """Build and yield the item describing one profile page.

        The item always has the keys ``"url"`` and ``"name"``; the remaining
        keys are the label texts found on the page, mapped to their values.
        """
        page = response.url
        # rstrip keeps a trailing slash from producing an empty name.
        item = {"url": page, "name": page.rstrip("/").split("/")[-1]}

        # Labels and values are sibling spans, paired here by position.
        labels = response.css("span.ProfileNav-label::text").extract()
        values = response.css("span.ProfileNav-value::text").extract()
        item.update(zip(labels, values))

        # Yielding (rather than printing) hands the item to Scrapy's
        # pipelines and feed exports.
        yield item

}

使用 CSV 文件某一列生成的列表的那部分代码报错了;另外,我还想知道如何把字典转换成 DataFrame。非常感谢!

1 个答案:

答案 0 :(得分:0)

下面这段代码对我有效;它适用于结构相同的网址列表:

 import scrapy
    class QuotesSpider(scrapy.Spider):
        """Collect the profile counters of every URL listed in a CSV file.

        The first row of the CSV is treated as a header and skipped. Each
        scraped page produces one dict with the page URL, the account name
        and one entry per label/value pair found in the profile navigation.
        """

        name = "popularity"
        # Input file and zero-based index of the column that holds the URLs.
        csv_path = 'csvfile'
        url_column = 9

        def start_requests(self):
            """Yield one request per non-empty URL found in the CSV column."""
            # Imported here so the snippet does not depend on the file header.
            import csv

            # Text mode with newline='' is what the csv module expects; the
            # reader also copes with quoted fields that contain commas.
            with open(self.csv_path, newline='', encoding='utf-8') as f:
                reader = csv.reader(f)
                # Skip the header row; the default avoids an error on an
                # empty file.
                next(reader, None)
                # Rows too short to contain the URL column are ignored.
                urls = [
                    row[self.url_column].strip()
                    for row in reader
                    if len(row) > self.url_column
                ]

            # The file is fully read and closed before any request is made.
            for url in urls:
                if url:
                    yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            """Build and yield the item describing one profile page.

            The item always has the keys ``"url"`` and ``"name"``; the
            remaining keys are the label texts found on the page, mapped to
            their values.
            """
            page = response.url
            # URL and name of the company; rstrip keeps a trailing slash from
            # producing an empty name.
            item = {"url": page, "name": page.rstrip("/").split("/")[-1]}

            # Labels and values are sibling spans, paired here by position.
            labels = response.css("span.ProfileNav-label::text").extract()
            values = response.css("span.ProfileNav-value::text").extract()
            item.update(zip(labels, values))
            yield item