Python SQLite insert of a list with a variable number of entries

Date: 2019-04-17 16:37:54

Tags: python sqlite insert

I have a web scraper that pulls data from a site and puts it into various lists for insertion into a SQLite database. Today I started pulling new data, but the problem is that, unlike the previous data, each record can have anywhere from 0 to 15 entries of a particular type. These are quirks applied to a card: each quirk has a name and a description.

I already have all the data I want to insert sitting in a list, but I don't know how to make the INSERT statement dynamic so that it is written with the correct column names (quirk# where # is a number from 1 to 15, and quirk_desc# where # is a number from 1 to 15) and with the correct data.

I know how to use executemany with question-mark placeholders and the list itself; I'm already doing that for the other inserts where the columns and data are static.

Is there a way to read how many columns a given row in the list needs and use a variable to write out the column names with the correct numbers? Could I name the columns inside the list and have the INSERT statement pull the column names from there?
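Roughly what I have in mind is the sketch below (hypothetical; it assumes each row of quirk_data_final looked like [Player, Link, Overall, Team, quirk1, desc1, quirk2, desc2, ...] and that cur is my cursor):

# Hypothetical sketch: build column names and placeholders from the length of each row
fixed_cols = ['Player', 'Link', 'Overall', 'Team']
for row in quirk_data_final:
    quirk_values = row[4:]                       # alternating quirk name / description
    quirk_cols = []
    for n in range(1, len(quirk_values) // 2 + 1):
        quirk_cols += [f'"Quirk {n}"', f'"Quirk {n} Description"']
    all_cols = [f'"{c}"' for c in fixed_cols] + quirk_cols
    sql = f'INSERT INTO Quirk ({", ".join(all_cols)}) VALUES ({", ".join("?" * len(row))})'
    cur.execute(sql, row)   # values still go through ? placeholders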

The list in question is quirk_data_final. I have not included my SQLite connection details or anything like that in the code.

The code starts by pulling the list of players for each team and compiling it into a list of links. It then loops through that list and pulls the data needed based on the player's type.

Per request, the database definition:


CREATE TABLE "Quirk" (
    "Player" TEXT,
    "Link" TEXT,
    "Overall" INTEGER,
    "Team" TEXT,
    "Quirk 1" TEXT,
    "Quirk 1 Description" TEXT,
    "Quirk 2" TEXT,
    "Quirk 2 Description" TEXT,
    "Quirk 3" TEXT,
    "Quirk 3 Description" TEXT,
    "Quirk 4" TEXT,
    "Quirk 4 Description" TEXT,
    "Quirk 5" TEXT,
    "Quirk 5 Description" TEXT,
    "Quirk 6" TEXT,
    "Quirk 6 Description" TEXT,
    "Quirk 7" TEXT,
    "Quirk 7 Description" TEXT,
    "Quirk 8" TEXT,
    "Quirk 8 Description" TEXT,
    "Quirk 9" TEXT,
    "Quirk 9 Description" TEXT,
    "Quirk 10" TEXT,
    "Quirk 10 Description" TEXT,
    "Quirk 11" TEXT,
    "Quirk 11 Description" TEXT,
    "Quirk 12" TEXT,
    "Quirk 12 Description" TEXT,
    "Quirk 13" TEXT,
    "Quirk 13 Description" TEXT,
    "Quirk 14" TEXT,
    "Quirk 14 Description" TEXT,
    "Quirk 15" TEXT,
    "Quirk 15 Description" TEXT,
    "Pull_Date_Time" NUMERIC
)

Code for the full application:

from bs4 import BeautifulSoup
soup = BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import sqlite3

fp = (r'C:\Users\USER\AppData\Roaming\Mozilla\Firefox\Profiles\ceo61d4m.default')
opts = Options()
opts.profile = fp
opts.headless = True
options = webdriver.FirefoxOptions()
path = r'C:\Users\USER\AppData\Roaming\Mozilla\Firefox\Profiles\ceo61d4m.default'
options.add_argument("user-data-dir="+path)
quote_page = ('https://mlb19.theshownation.com/inventory?type=players&optional_id=2')
driver = webdriver.Firefox(executable_path=r'C:\Users\USER\.spyder-py3\geckodriver.exe', firefox_options = opts)
driver.get(quote_page)

soup = BeautifulSoup(driver.page_source, "lxml")

all_player_data_P = []
all_player_data_NoP = []
all_player_quirk_data = []
quirk_data_final = []

def pull_player(url2):
    global all_player_data_P
    global all_player_data_NoP
    global quirk_data_final
    stats_data = []
    general_data = []
    pitching_data = []
    pitch_speeds = []
    driver.get(url2)
    soup = BeautifulSoup(driver.page_source, "lxml")
    span = soup.find('div', class_='card-asset')
    Last = span.find('div', class_='card-name-last').string
    First = span.find('div', class_='card-name-first').string
    try:
        Player = First+' '+Last
    except TypeError:
        Player = First
    Link = url2
    Team = span.find('div', class_='card-team').string
    span2 = soup.find('div', class_='widget-main title-widget-main')
    General = span2.find('h1').getText()
    list = General.split()
    Position = list[0]
    Number = list[1]
    rows = span2.find_all('div', class_='flex-table-cell')
    for each in rows:
        general_data.append(each.text.strip().encode('utf-8'))
    Overall = general_data[0].decode("utf-8") 
    Bats = general_data[1].decode("utf-8") 
    Throws = general_data[2].decode("utf-8") 
    Secondary = general_data[3].decode("utf-8")
    Secondary2 = str(Secondary)
    Secondary2 = Secondary2.replace(',', '-')
    Weight = general_data[4].decode("utf-8") 
    Height = general_data[5].decode("utf-8")
    Height2 = int(''.join(filter(str.isdigit, Height)))
    Height2 = str(Height2)
    Height2 = Height2[:1]+'.'+Height2[1:]
    Age = general_data[6].decode("utf-8") 
    Born = general_data[7].decode("utf-8") 
    span3 = soup.find('div', class_='player-stat-tables-right')

    all_player_quirk_data.append([Player, Link, Overall, Team])
    quirks = soup.find('div', class_='quirk-wrapper')              
    for length in quirks.find_all('div', class_='quirk-item'):
        for quirk in length.find_all("strong"):
            all_player_quirk_data.append(quirk.contents)
        for quirkdesc in length.find_all('br'):
            all_player_quirk_data.append(quirkdesc.next_sibling.strip())            


    for line in all_player_quirk_data:
        if line not in quirk_data_final:
            quirk_data_final.append(line)

    if Position in ('SP','RP','CP'):
        attributes = span2.find_all('div', class_='player-attr-number')
        for each in attributes:
            stats_data.append(each.text.strip().encode('utf-8'))  
        for length in span3.find_all("strong"):
            pitch_speeds.append(length.next_sibling.strip())
            pitching_data.append(length.contents)
        Pitch_1 = str(pitching_data[0])
        Pitch_2 = str(pitching_data[1])
        Pitch_3 = str(pitching_data[2])
        try:
            Pitch_4 = str(pitching_data[3])
        except IndexError:
            Pitch_4 = 'null'
        try:
            Pitch_5 = str(pitching_data[4])
        except IndexError:
            Pitch_5 = 'null'
        Pitch_1_Speed = str(pitch_speeds[0])
        Pitch_2_Speed = str(pitch_speeds[1])
        Pitch_3_Speed = str(pitch_speeds[2]) 
        try:
            Pitch_4_Speed = str(pitch_speeds[3])
        except IndexError:
            Pitch_4_Speed = 'null'
        try:
            Pitch_5_Speed = str(pitch_speeds[4])
        except IndexError:
            Pitch_5_Speed = 'null'
        Pitch_1 = Pitch_1.translate({ord(i): None for i in '[]'})
        Pitch_2 = Pitch_2.translate({ord(i): None for i in '[]'})
        Pitch_3 = Pitch_3.translate({ord(i): None for i in '[]'})
        Pitch_4 = Pitch_4.translate({ord(i): None for i in '[]'})
        Pitch_5 = Pitch_5.translate({ord(i): None for i in '[]'})
        Pitch_1_Speed = Pitch_1_Speed.translate({ord(i): None for i in '(),'})
        Pitch_2_Speed = Pitch_2_Speed.translate({ord(i): None for i in '(),'})
        Pitch_3_Speed = Pitch_3_Speed.translate({ord(i): None for i in '(),'})
        Pitch_4_Speed = Pitch_4_Speed.translate({ord(i): None for i in '(),'})
        Pitch_5_Speed = Pitch_5_Speed.translate({ord(i): None for i in '(),'})
        Stamina = stats_data[0].decode("utf-8")
        Hper9 =stats_data[1].decode("utf-8")
        Kper9 =stats_data[2].decode("utf-8")
        BBper9 =stats_data[3].decode("utf-8")
        HRper9 =stats_data[4].decode("utf-8")
        Clutch_P =stats_data[5].decode("utf-8")
        Control =stats_data[6].decode("utf-8")
        Velocity =stats_data[7].decode("utf-8")
        Break =stats_data[8].decode("utf-8")
        Contact_Right =stats_data[9].decode("utf-8")
        Contact_Left =stats_data[10].decode("utf-8")
        Power_Right =stats_data[11].decode("utf-8")
        Power_Left =stats_data[12].decode("utf-8")
        Vision =stats_data[13].decode("utf-8")
        Discipline =stats_data[14].decode("utf-8")
        Clutch_H =stats_data[15].decode("utf-8")
        Bunt =stats_data[16].decode("utf-8")
        Drag_Bunt =stats_data[17].decode("utf-8")
        Durability =stats_data[18].decode("utf-8")
        Fielding =stats_data[19].decode("utf-8")
        Arm_Strength =stats_data[20].decode("utf-8")
        Arm_Accuracy =stats_data[21].decode("utf-8")
        Reaction =stats_data[22].decode("utf-8")
        Speed =stats_data[23].decode("utf-8")
        Steal =stats_data[24].decode("utf-8")
        Baserunning_Aggressiveness =stats_data[25].decode("utf-8")
        all_player_data_P.append([Player, Link, Overall, Team, Position, Number, Bats, Throws, Secondary2, Weight, Height2, Age, Born, 
        Pitch_1,
        Pitch_1_Speed,
        Pitch_2,
        Pitch_2_Speed,
        Pitch_3,
        Pitch_3_Speed,
        Pitch_4,
        Pitch_4_Speed,
        Pitch_5,        
        Pitch_5_Speed,
        Stamina,
        Hper9,
        Kper9,
        BBper9,
        HRper9,
        Clutch_P,
        Control,
        Velocity,
        Break,
        Contact_Right,
        Contact_Left,
        Power_Right,
        Power_Left,
        Vision,
        Discipline,
        Clutch_H,
        Bunt,
        Drag_Bunt,
        Durability,
        Fielding,
        Arm_Strength,
        Arm_Accuracy,
        Reaction,
        Speed,
        Steal,
        Baserunning_Aggressiveness])
        print('Added: '+Player)
    else:
        attributes = span2.find_all('div', class_='player-attr-number')
        for each in attributes:
            stats_data.append(each.text.strip().encode('utf-8'))
        Contact_Right =stats_data[0].decode("utf-8")
        Contact_Left =stats_data[1].decode("utf-8")
        Power_Right =stats_data[2].decode("utf-8")
        Power_Left =stats_data[3].decode("utf-8")
        Vision =stats_data[4].decode("utf-8")
        Discipline =stats_data[5].decode("utf-8")
        Clutch_H =stats_data[6].decode("utf-8")
        Bunt =stats_data[7].decode("utf-8")
        Drag_Bunt =stats_data[8].decode("utf-8")
        Durability =stats_data[9].decode("utf-8")
        Fielding =stats_data[10].decode("utf-8")
        Arm_Strength =stats_data[11].decode("utf-8")
        Arm_Accuracy =stats_data[12].decode("utf-8")
        Reaction =stats_data[13].decode("utf-8")
        if Position == 'C' or (Secondary in ('C') and Secondary not in 'CF'):
            Blocking = stats_data[14].decode("utf-8")
            Speed =stats_data[15].decode("utf-8")
            Steal =stats_data[16].decode("utf-8")
            Baserunning_Aggressiveness =stats_data[17].decode("utf-8")
        else:
            Blocking = ''
            Speed =stats_data[14].decode("utf-8")
            Steal =stats_data[15].decode("utf-8")
            Baserunning_Aggressiveness =stats_data[16].decode("utf-8")
        all_player_data_NoP.append([Player, Link, Overall, Team, Position, Number, Bats, Throws, Secondary2, Weight, Height2, Age, Born,    
        Contact_Right,
        Contact_Left,
        Power_Right,
        Power_Left,
        Vision,
        Discipline,
        Clutch_H,
        Bunt,
        Drag_Bunt,
        Durability,
        Fielding,
        Arm_Strength,
        Arm_Accuracy,
        Reaction,
        Blocking, 
        Speed,
        Steal,
        Baserunning_Aggressiveness])
        print('Added: '+Player)
    return;

data = []
teams = 28#-1

while teams < 30:
    if teams == -1:
        page_count_max = 36
    else:
        page_count_max = 7
    page = 1
    while True:        
        urltemplate = "https://mlb19.theshownation.com/inventory?page={page}&type=players&optional_id={team}"
        url = urltemplate.format(page=page, team=teams)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
        spans = soup.find_all('div', class_='flex-grid-quad-box')
        for span in spans:   
            Link = 'https://mlb19.theshownation.com'+span.a['href']
            data.append(Link)
        page += 1
        print (f'Working: Team- {teams} Page- {page}')
        if page > page_count_max:
            break        
    teams +=1    

for index in data:
    pull_player(url2 = index) 

print(quirk_data_final)

1 Answer:

Answer 0 (score: 1):

As mentioned, consider one long, normalized table and avoid the numbered Quirk columns. For distribution to your users, create a view (i.e., a saved SELECT query) to meet your reporting needs.

SQLite (ideally, players and their traits would not be repeated here but stored in related master tables)

CREATE TABLE "Quirks" (
   "ID" INTEGER PRIMARY KEY,
   "Player" TEXT, 
   "Link" TEXT, 
   "Overall" INTEGER, 
   "Team" TEXT, 
   "Quirk_Num" INTEGER,
   "Quirk" TEXT, 
   "Description" TEXT
);

-- CONDITIONAL AGGREGATION TO REPLICATE ORIGINAL TABLE SETUP
CREATE VIEW Quirk_View AS

    SELECT Player, Link, Overall, Team,
           MAX(CASE WHEN Quirk_Num = 1 THEN Quirk ELSE NULL END) AS [Quirk 1],
           MAX(CASE WHEN Quirk_Num = 1 THEN Description ELSE NULL END) AS [Quirk 1 Description],
           MAX(CASE WHEN Quirk_Num = 2 THEN Quirk ELSE NULL END) AS [Quirk 2],
           MAX(CASE WHEN Quirk_Num = 2 THEN Description ELSE NULL END) AS [Quirk 2 Description],
           MAX(CASE WHEN Quirk_Num = 3 THEN Quirk ELSE NULL END) AS [Quirk 3],
           MAX(CASE WHEN Quirk_Num = 3 THEN Description ELSE NULL END) AS [Quirk 3 Description],
           ...
    FROM Quirks
    GROUP BY Player, Link, Overall, Team;
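
The view can then be queried just like the original wide table, for example (a sketch using the view defined above):

-- One row per player, same shape as the original wide table
SELECT Player, Team, [Quirk 1], [Quirk 1 Description], [Quirk 2]
FROM Quirk_View;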

HTML (assumed structure of the quirk section being scraped)

...
<div class="quirk-wrapper">
    <div class="quirk-item">
        <strong>Quirk 1</strong>
        <br/>Quirk 1 Description
    </div>
    <div class="quirk-item">
        <strong>Quirk 2</strong>
        <br/>Quirk 2 Description
    </div>
    <div class="quirk-item">
        <strong>Quirk 3</strong>
        <br/>Quirk 3 Description
    </div>
    ...
</div>
...

Python

import sqlite3
from bs4 import BeautifulSoup
...

soup = ... ;  conn = ...; cur = ...

### WEB SCRAPE
all_player_quirk_data = []

quirks = soup.find('div', class_='quirk-wrapper')   

# USING enumerate() FOR ITERATOR NUMBER (start=1 so it lines up with Quirk_Num in the view)
for i, item in enumerate(quirks.find_all('div', class_='quirk-item'), start=1):
    Quirk = None; Desc = None

    if item.find("strong") is not None:
        Quirk = item.find("strong").contents
    if item.find("br") is not None:
        Desc = item.find('br').next_sibling.strip()

    all_player_quirk_data.append([Player, Link, Overall, Team, i, Quirk, Desc])

### DATABASE APPEND
sql = """INSERT INTO Quirks (Player, Link, Overall, Team, Quirk_Num, Quirk, Description)
         VALUES (?, ?, ?, ?, ?, ?, ?)
      """ 
cur.executemany(sql, all_player_quirk_data)
conn.commit()
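
As a quick sanity check after the commit (assuming the same cur as above):

# Count how many quirks were stored per player
cur.execute("SELECT Player, COUNT(*) FROM Quirks GROUP BY Player")
print(cur.fetchall())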