I want to build a dataframe by fetching data from each page of an API (limited to 100 rows per page). Currently, the code below returns all of the data, but in the wrong structure.
There are 17 headers, so I need the data in 17 columns. However, it outputs a dataframe of [100 rows x 1700 columns], and I need [10000 rows x 17 columns].
I'm not sure how to achieve this - any help would be much appreciated.
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import pandas as pd

x = []
for i in range(1, 101):
    print(type(i))
    api = finding(siteid='EBAY-GB', appid='some_id', config_file=None)
    response = api.execute('findItemsByKeywords', {'keywords': 'phone', 'outputSelector': 'SellerInfo',
                                                   'paginationInput': {'entriesPerPage': '2', 'pageNumber': ' '+str(i)}})
    soup = BeautifulSoup(response.content, 'lxml')
    items = soup.find_all('item')
    headers = ['itemid','title','categoryname','categoryid','postalcode','location','sellerusername','feedbackscore','positivefeedbackpercent','topratedseller','shippingservicecost','buyitnowavailable','currentprice','starttime','endtime','watchcount','conditionid']
    for object in headers:
        values = [element.text for element in soup.find_all(object)]
        x.append(values)

df = pd.DataFrame(x)
df = df.T
print(x)
#[['152668959069', '252999725410'], ['Samsung GALAXY Ace GT-S5830i (Unlocked) Smartphone Android Phone- ALL COLOURS UK', '8GB 3G Unlocked Android 5.1 Quad Core Smartphone Mobile Phone 2 SIM GPS qHD'], ['Mobile & Smart Phones', 'Mobile & Smart Phones'], ['9355', '9355'], ['RM137PP'], ['Rainham,United Kingdom', 'United Kingdom'], ['deals4u_shop', 'smartlife2017'], ['15700', '456'], ['99.9', '98.5'], ['true', 'true'], ['0.0', '0.0'], ['false', 'false'], ['32.49', '48.9'], ['2017-08-18T18:36:28.000Z', '2017-06-19T09:04:40.000Z'], ['2017-12-16T18:36:28.000Z', '2017-12-16T09:04:40.000Z'], ['272', '134'], ['1000', '1000']]
print(df)
0 1 \
0 152668959069 Samsung GALAXY Ace GT-S5830i (Unlocked) Smartp...
1 252999725410 8GB 3G Unlocked Android 5.1 Quad Core Smartpho...
2 3 4 5 \
0 Mobile & Smart Phones 9355 RM137PP Rainham,United Kingdom
1 Mobile & Smart Phones 9355 None United Kingdom
6 7 8 9 ... 24 25 26 27 28 29 \
0 deals4u_shop 15700 99.9 true ... 456 98.5 true 0.0 false 48.9
1 smartlife2017 456 98.5 true ... 456 98.5 true 0.0 false 48.9
30 31 32 33
0 2017-06-19T09:04:40.000Z 2017-12-16T09:04:40.000Z 214 1000
1 2017-06-19T09:04:40.000Z 2017-12-16T09:04:40.000Z 182 1000
EDIT: Added more code above, a print of x for the first 2 entries of the first page, and the df for the first 2 entries of 2 pages.
Answer 0 (score: 0)
This should work better. The original code appends all 17 header lists for every page to one flat list x, so the transpose ends up with 17 x 100 = 1700 columns; building a dict keyed by header keeps exactly 17 columns.
Dict comprehension version:
data_dict = {obj: [element.text for element in soup.find_all(obj)] for obj in headers}
df = pd.DataFrame(data_dict)
Loop version:
data_dict = {}
for obj in headers:
    data_dict[obj] = [element.text for element in soup.find_all(obj)]
df = pd.DataFrame(data_dict)
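For the pagination part, a minimal sketch (not in the original answer) of how this per-page dict could be folded into the loop and row-bound at the end. It assumes the api connection, headers list, and imports from the question; note that pd.DataFrame raises a ValueError if the per-header lists have unequal lengths (e.g. when an item lacks a postalcode), which would then need padding with None:

# Sketch only: one dataframe per page, concatenated at the end.
# Assumes `api` and `headers` are defined as in the question.
pages = []
for i in range(1, 101):
    response = api.execute('findItemsByKeywords', {
        'keywords': 'phone',
        'outputSelector': 'SellerInfo',
        'paginationInput': {'entriesPerPage': '100', 'pageNumber': str(i)}})
    soup = BeautifulSoup(response.content, 'lxml')
    data_dict = {obj: [element.text for element in soup.find_all(obj)]
                 for obj in headers}
    pages.append(pd.DataFrame(data_dict))

df = pd.concat(pages, ignore_index=True)  # ~10000 rows x 17 columns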
Answer 1 (score: 0)
Consider iteratively appending to a list of dataframes, followed by a final concatenation:
...
df_list = []
api = finding(siteid='EBAY-GB', appid='some_id', config_file=None)

for i in range(1, 101):
    print(i)
    response = api.execute('findItemsByKeywords',
                           {'keywords': 'phone',
                            'outputSelector': 'SellerInfo',
                            'paginationInput': {'entriesPerPage': '2',
                                                'pageNumber': ' '+str(i)}})
    soup = BeautifulSoup(response.content, 'lxml')

    headers = ['itemid','title','categoryname','categoryid','postalcode','location',
               'sellerusername','feedbackscore','positivefeedbackpercent','topratedseller',
               'shippingservicecost','buyitnowavailable','currentprice','starttime',
               'endtime','watchcount','conditionid']

    # NESTED LIST COMPREHENSION PARSING ELEMENTS OF API RESPONSE (ONE LIST PER HEADER)
    values = [[element.text for element in soup.find_all(obj)] for obj in headers]

    # DICT COMPREHENSION WITH ZIP TO DF THAT NAMES EACH COLUMN AND PADS SINGLE-VALUE LISTS WITH None
    tmp = pd.DataFrame({h: v if len(v) > 1 else v + [None] for h, v in zip(headers, values)})

    # APPEND PAGE FRAME TO LIST
    df_list.append(tmp)

# ROW-BIND ALL PAGES INTO FINAL DF
final_df = pd.concat(df_list, ignore_index=True)
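To illustrate the padding step, a toy example using values taken from the question's printed x, where only one of the two items has a postalcode:

import pandas as pd

headers = ['itemid', 'postalcode']
values = [['152668959069', '252999725410'], ['RM137PP']]  # second item lacks a postalcode

tmp = pd.DataFrame({h: v if len(v) > 1 else v + [None] for h, v in zip(headers, values)})
print(tmp)
#          itemid postalcode
# 0  152668959069    RM137PP
# 1  252999725410       None

Note this pads only single-value lists, which matches entriesPerPage='2'; for 100 entries per page you would pad each list to the page length instead.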
Answer 2 (score: 0)
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import pandas as pd

def flatten(lst):
    # Recursively yield the leaves of a nested list
    for x in lst:
        if isinstance(x, list):
            for y in flatten(x):
                yield y
        else:
            yield x

full_dict = {}
result = {}
for i in range(1, 101):
    print(i)
    api = finding(siteid='EBAY-GB', appid='some key', config_file=None)
    response = api.execute('findItemsByKeywords', {'keywords': 'phone', 'outputSelector': 'SellerInfo',
                                                   'paginationInput': {'entriesPerPage': '100', 'pageNumber': ' '+str(i)}})
    soup = BeautifulSoup(response.content, 'lxml')
    items = soup.find_all('item')
    headers_tuple = ('itemid','title','categoryname','categoryid','postalcode','location','sellerusername','feedbackscore','positivefeedbackpercent','topratedseller','shippingservicecost','buyitnowavailable','currentprice','starttime','endtime','watchcount','conditionid')
    # Collect this page's values per header
    data_dict = {}
    for obj in headers_tuple:
        x = [element.text for element in soup.find_all(obj)]
        data_dict[obj] = x
    # Merge this page into the running result, key by key
    for key in (data_dict.keys() | full_dict.keys()):
        if key in data_dict: result.setdefault(key, []).append(data_dict[key])
        if key in full_dict: result.setdefault(key, []).append(full_dict[key])

# Flatten each header's per-page lists into one list, then build the frame
final_dict = {k: list(flatten(v)) for k, v in result.items()}
df = pd.DataFrame.from_dict(final_dict, orient='index')
df = df.T
This is my answer, in case anyone is interested. It works, but the column order changes for some reason and I don't know why. Thanks for your help!
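A likely cause of the column reordering: data_dict.keys() | full_dict.keys() produces a set, and set iteration order is arbitrary, so the keys of result (and therefore the columns) come out in an unpredictable order. A sketch of a one-line fix, reusing headers_tuple from the code above, is to reindex the columns after the transpose:

# After df = df.T above, reorder columns to match headers_tuple
df = df[list(headers_tuple)]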