import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch the first page of the Digi-Key fixed-inductor listing.
# The URL has to be one unbroken string literal: a plain quoted string cannot
# span two source lines.
res = requests.get("https://www.digikey.com/products/en/inductors-coils-chokes/fixed-inductors/71/page/1")
soup = BeautifulSoup(res.content,'lxml')
# Take the first <table> in the document (presumably the product listing)
# and let pandas turn it into a DataFrame.
table = soup.find_all('table')[0]
df = pd.read_html(str(table))[0]
# Pull each column of interest out of the DataFrame as a plain Python list.
part_numbers = df["Manufacturer Part Number"].tolist()
manufacturer = df["Manufacturer"].tolist()
quantity_available = df["Quantity Available"].tolist()
m_qty = df["Minimum Quantity"].tolist()
types = df["Type"].tolist()
material = df["Material - Core"].tolist()
inductance = df["Inductance"].tolist()
tolerance = df["Tolerance"].tolist()
current_rating = df["Current Rating"].tolist()
current_saturation = df["Current - Saturation"].tolist()
shielding = df["Shielding"].tolist()
resistances = df["DC Resistance (DCR)"].tolist()
freq = df["Q @ Freq"].tolist()
frequency = df["Frequency - Self Resonant"].tolist()
ratings = df["Ratings"].tolist()
operating_temperature = df["Operating Temperature"].tolist()
i_frequency = df["Inductance Frequency - Test"].tolist()
mounting_type = df["Mounting Type"].tolist()
package = df["Package / Case"].tolist()
s_package = df["Supplier Device Package"].tolist()
size_dimension = df["Size / Dimension"].tolist()
height = df["Height - Seated (Max)"].tolist()
# NOTE(review): this is the lookup reported as not working. The parsed header
# is probably not spelled exactly "Unit Price,USD" -- print(df.columns) to see
# the real column label before indexing.
unit_price = df["Unit Price,USD"].tolist()
我正在尝试抓取网页数据,但不知道如何提取单价这一列。如代码末尾所示,其他列都能正常读取,只有读取单价的那一行无法工作。我是否需要查看页面的HTML源码来确定该列的名称?另外,我还需要一个循环来抓取后续页面。
答案 0 :(得分:0)
我解决了从数据帧 df 中选择 unit_price 的问题。循环部分需要您自己完成。
代码:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the first listing page; fail fast on a hung connection or an HTTP
# error status instead of trying to parse an error page.
res = requests.get(
    "https://www.digikey.com/products/en/inductors-coils-chokes/fixed-inductors/71/page/1",
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
# read_html expects a file-like object; passing a literal HTML string is
# deprecated in recent pandas releases, hence the StringIO wrapper.
df = pd.read_html(StringIO(str(table)))[0]
# Normalise the header labels: use the first header level, trim the ends,
# drop embedded newlines and collapse runs of two or more whitespace
# characters into one space, so the columns can be addressed by simple names
# such as "Unit Price USD".
df.columns = [
    re.sub(r"\s\s+", " ", str(name).strip().replace('\n', ''))
    for name in df.columns.get_level_values(0)
]
print(df.columns)
# Extract each column of interest as a plain Python list.
part_numbers = df["Manufacturer Part Number"].values.tolist()
manufacturer = df["Manufacturer"].values.tolist()
quantity_available = df["Quantity Available"].values.tolist()
m_qty = df["Minimum Quantity"].values.tolist()
types = df["Type"].values.tolist()
material = df["Material - Core"].values.tolist()
inductance = df["Inductance"].values.tolist()
tolerance = df["Tolerance"].values.tolist()
current_rating = df["Current Rating"].values.tolist()
current_saturation = df["Current - Saturation"].values.tolist()
shielding = df["Shielding"].values.tolist()
resistances = df["DC Resistance (DCR)"].values.tolist()
freq = df["Q @ Freq"].values.tolist()
frequency = df["Frequency - Self Resonant"].values.tolist()
ratings = df["Ratings"].values.tolist()
operating_temperature = df["Operating Temperature"].values.tolist()
i_frequency = df["Inductance Frequency - Test"].values.tolist()
mounting_type = df["Mounting Type"].values.tolist()
package = df["Package / Case"].values.tolist()
s_package = df["Supplier Device Package"].values.tolist()
size_dimension = df["Size / Dimension"].values.tolist()
height = df["Height - Seated (Max)"].values.tolist()
unit_price = df["Unit Price USD"].values.tolist()
print(df.head())
输出:
Index(['Compare Parts', '', 'Image', 'Digi-Key Part Number',
'Manufacturer Part Number', 'Manufacturer', 'Description',
'Quantity Available', 'Unit Price USD', 'Minimum Quantity', 'Packaging',
'Series', 'Part Status', 'Type', 'Material - Core', 'Inductance',
'Tolerance', 'Current Rating', 'Current - Saturation', 'Shielding',
'DC Resistance (DCR)', 'Q @ Freq', 'Frequency - Self Resonant',
'Ratings', 'Operating Temperature', 'Inductance Frequency - Test',
'Features', 'Mounting Type', 'Package / Case',
'Supplier Device Package', 'Size / Dimension', 'Height - Seated (Max)'],
dtype='object')
Compare Parts Image Digi-Key Part Number Manufacturer Part Number \
0 NaN NaN NaN 732-1728-2-ND 744765116A
1 NaN NaN NaN 732-1728-1-ND 744765116A
2 NaN NaN NaN 732-1728-6-ND 744765116A
3 NaN NaN NaN 732-10762-2-ND 74404043220A
4 NaN NaN NaN 732-10762-1-ND 74404043220A
Manufacturer Description \
0 Wurth Electronics Inc. FIXED IND 16NH 560MA 220 MOHM
1 Wurth Electronics Inc. FIXED IND 16NH 560MA 220 MOHM
2 Wurth Electronics Inc. FIXED IND 16NH 560MA 220 MOHM
3 Wurth Electronics Inc. FIXED IND 22UH 1.11A 200 MOHM
4 Wurth Electronics Inc. FIXED IND 22UH 1.11A 200 MOHM
Quantity Available Unit Price USD \
0 39,000 - Immediate Available: 39,000 $0.25500
1 39,942 - Immediate Available: 39,942 $0.41000
2 39,942 - Immediate Available: 39,942 Digi-Reel®
3 18,000 - Immediate Available: 18,000 $0.31736
4 20,170 - Immediate Available: 20,170 $0.60000
Minimum Quantity ... Frequency - Self Resonant \
0 3,000 Minimum: 3,000 ... 3.1GHz
1 1 Minimum: 1 ... 3.1GHz
2 1 Minimum: 1 ... 3.1GHz
3 3,000 Minimum: 3,000 ... 26MHz
4 1 Minimum: 1 ... 26MHz
Ratings Operating Temperature Inductance Frequency - Test Features \
0 - -40°C ~ 125°C 250MHz -
1 - -40°C ~ 125°C 250MHz -
2 - -40°C ~ 125°C 250MHz -
3 - -40°C ~ 125°C 100kHz -
4 - -40°C ~ 125°C 100kHz -
Mounting Type Package / Case Supplier Device Package \
0 Surface Mount 0402 (1005 Metric) 0402 (1005 Metric)
1 Surface Mount 0402 (1005 Metric) 0402 (1005 Metric)
2 Surface Mount 0402 (1005 Metric) 0402 (1005 Metric)
3 Surface Mount Nonstandard SMD
4 Surface Mount Nonstandard SMD
Size / Dimension Height - Seated (Max)
0 0.039" L x 0.022" W (1.00mm x 0.55mm) 0.024" (0.60mm)
1 0.039" L x 0.022" W (1.00mm x 0.55mm) 0.024" (0.60mm)
2 0.039" L x 0.022" W (1.00mm x 0.55mm) 0.024" (0.60mm)
3 0.157" L x 0.157" W (4.00mm x 4.00mm) 0.098" (2.50mm)
4 0.157" L x 0.157" W (4.00mm x 4.00mm) 0.098" (2.50mm)
[5 rows x 32 columns]
答案 1 :(得分:0)
除了Ali的答案,这是您想要的循环:
... your initial soup and page parsing ...
from io import StringIO
from urllib.parse import urljoin

# Follow the "Next" link page by page until the current page no longer has one.
while True:
    next_link = soup.find('a', class_='Next')  # here we search next button on page
    if next_link is None:
        break
    # urljoin copes with both root-relative and absolute hrefs, unlike plain
    # string concatenation onto the site root.
    res = requests.get(urljoin('https://www.digikey.com/', next_link['href']))  # "clicking on next"
    soup = BeautifulSoup(res.content, 'lxml')
    # Select the table by its class instead of taking element zero of find_all.
    table = soup.find('table', class_='productTable')
    # read_html expects a file-like object; literal HTML strings are deprecated.
    df = pd.read_html(StringIO(str(table)))[0]
    # Per-page scraping logic belongs here, inside the loop body.
...and all your scraping logic goes here...