从搜索栏抓取价格——网站链接已更改

时间:2020-06-17 20:36:31

标签: python json web-scraping request

在这里的一些专家的帮助下,我写出了一个运行良好的爬虫(scraper)。

必不可少的代码行实际上是:

data = {"partOptionFilter": {"PartNumber": PN.iloc[i, 0], "AlternativeOemId": "17155"}}
r = requests.post('https://www.partssource.com/catalog/Service', json=data).json()

但是该站点最近将其链接从partsfinder.com更改为partssource.com,之后该代码似乎不再正常工作了。

只是想知道我是否可以在原始代码上使用一个技巧来使它再次工作。

感谢任何想法,谢谢!

import requests
import pandas as pd


# Load the spreadsheet of part numbers; only the 'Product code' column is used.
df = pd.read_excel(r'C:\Users\212677036\Documents\Part Number Input.xlsx')
PN = pd.DataFrame(df, columns=['Product code'])

SERVICE_URL = 'https://www.partssource.com/catalog/Service'

# Alternative OEM ids to query for every part number, in output order.
OEM_IDS = ["17155", "17475", "16880", "47221", "17045", "17055"]

# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30


def fetch_price(part_number, oem_id):
    """Return the 'YourPrice' of the first part option for one part/OEM pair.

    Posts a ``partOptionFilter`` query to SERVICE_URL and reads
    ``['Data']['PartOptions'][0]['YourPrice']`` from the JSON reply.

    Raises:
        KeyError / IndexError: if the reply lacks those keys or has no
            part options.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    payload = {"partOptionFilter": {"PartNumber": part_number, "AlternativeOemId": oem_id}}
    reply = requests.post(SERVICE_URL, json=payload, timeout=REQUEST_TIMEOUT).json()
    return reply['Data']['PartOptions'][0]['YourPrice']


# One list of prices per OEM id; each list gets one entry per part number.
prices_by_oem = {oem_id: [] for oem_id in OEM_IDS}

# Iterate the first (and only) column of PN directly instead of indexing by row.
for part_number in PN.iloc[:, 0]:
    for oem_id in OEM_IDS:
        prices_by_oem[oem_id].append(fetch_price(part_number, oem_id))

# Stack the per-OEM price columns vertically (one block of rows per OEM id)
# and write them out as a single CSV.
list_of_dataframes = [pd.DataFrame(prices_by_oem[oem_id]) for oem_id in OEM_IDS]

pd.concat(list_of_dataframes).to_csv(r'C:\Users\212677036\Documents\output25.csv')

1 个答案:

答案 0 :(得分:1)

使用 Chrome / Firefox 中的 DevTools 观察请求之后,我写出了这段代码。

页面使用不同的url,发送不同的数据,使用不同的键获取结果。

您将不得不使用DevTools来观察浏览器到服务器的更多请求,才能识别出如何在data中使用更多参数

import requests

search_text = "mobile"

SEARCH_URL = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'

# Request body for the search endpoint. "facets" and "urlParams" are sent
# empty here; the original author sketched (but left disabled) entries like
#   "facets":    [{"name": "OEM", "value": "GE%20Healthcare"}]
#   "urlParams": [{"name": "OEM", "value": "GE Healthcare"}]
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 15,
    "query": search_text,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": [],
}

response = requests.post(SEARCH_URL, json=payload)
result = response.json()

# Drill down step by step: products -> first product -> its options ->
# first option -> price. Print any intermediate value to explore the reply.
first_product = result['products'][0]
first_option = first_product['options'][0]

print(first_option['price'])

编辑(2020.09.01)

如果您有多个查询,请使用for循环多次运行相同的代码,但每次使用不同的查询。而且,当您获得一个查询的数据时,请使用for循环从data['products']中获取所有价格

编辑(2020.09.06)

我在get_data()中添加了变量start和limit,然后在循环for start in range(0, limit*10, limit)中运行它以获取10页(每页包含100个元素)

import requests
# import pprint  # to format data on screen `pprint.pprint()

# --- functions ---

def get_data(query, start=0, limit=15):
    """Send one search request and return the decoded JSON reply.

    `query` is placed in the body's "query" field; `start` and `limit` are
    forwarded unchanged in the "start" and "limit" fields. The "facets" and
    "urlParams" fields are always sent as empty lists.
    """
    search_url = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'

    # To narrow the search, "facets" / "urlParams" could carry entries such as
    #   {"name": "OEM", "value": "GE%20Healthcare"}  (facets)
    #   {"name": "OEM", "value": "GE Healthcare"}    (urlParams)
    body = {
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": limit,
        "query": query,
        "referer": "/catalog/Service",
        "start": start,
        "urlParams": [],
    }

    return requests.post(search_url, json=body).json()

def show_data(data):
    """Print a readable listing of the prices in one search response.

    Output, in order:
      * the price of the first option of the first product, when both
        exist (skipped otherwise instead of raising IndexError);
      * for every product: its title, then one line per option with the
        price and vendor item number, or 'price: unknown' when the product
        has no options, followed by a '---' separator.

    Args:
        data: decoded response with a 'products' list; each product has a
            'title' and an 'options' list whose entries have 'price' and
            'vendorItemNumber'.

    Raises:
        KeyError: if any of those keys is missing.
    """
    products = data['products']

    # Headline price. Guarded because a page may be empty or its first
    # product may have no options, which the loop below already tolerates.
    if products and products[0]['options']:
        print(products[0]['options'][0]['price'])

    for item in products:
        print('title:', item['title'])

        if not item['options']:
            print('price: unknown')
        else:
            for option in item['options']:
                print('price:', option['price'], '| vendor item number:', option['vendorItemNumber'])

        print('---')
    
def filter_data(data):
    """Return one list of option prices per product in `data['products']`.

    A product whose 'options' value is empty (or otherwise falsy) contributes
    an empty list, so the result always has one entry per product.
    """
    return [
        [offer['price'] for offer in (product['options'] or ())]
        for product in data['products']
    ]
    
# --- main ---

all_queries = ["mobile", 'GE Healthcare']

limit = 100  # number of results requested per page
page_count = 10

# Start offsets of the pages to fetch: 0, limit, 2*limit, ...
page_starts = range(0, limit * page_count, limit)

for query in all_queries:
    for start in page_starts:
        print('\n--- QUERY:', query, 'start:', start, '---\n')

        # Fetch one page and reduce it to per-product price lists;
        # pass the page to show_data() instead for a verbose listing.
        page = get_data(query, start, limit)
        print(filter_data(page))
相关问题