我目前在一个学校项目中,该项目涉及对网站进行爬网以收集信息。我在Python中使用BeautifulSoup4,并成功将其另存为.txt。
如何我似乎很难将其上传到MySql Server。我正在使用pymysql。我认为INSERT命令不起作用。我已经尝试过commit和Internet上建议的所有其他解决方案。
这是当前代码:
import requests
import pymysql
from bs4 import BeautifulSoup
db = pymysql.connect(host="localhost",
user="root",
password="1234",
db="bookstore",
charset='utf8')
x = db.cursor()
def spider(origin,page):
while 1:
url = origin + str(page)
req = requests.get(url)
html = req.text
soup = BeautifulSoup(html, 'lxml')
print(page)
try:
title = soup.find('a', {'class': 'p_topt01'})
try:
x.execute('INSERT INTO book(title) VALUES ('+title.text+')')
except:
page += 1
continue
author = soup.find_all('a', class_="np_af")
x.execute('INSERT INTO book(author) VALUES('+author[0].text+')')
a = 1
while a < (len(author)):
if 'PublisherSearch' in str(author[a].get('href')):
x.execute('INSERT INTO book(publisher) VALUES(' +
author[a].text + ')')
elif 'AuthorSearch' in str(author[a].get('href')):
x.execute('INSERT INTO book(translator) VALUES(' + author[a].text + ')')
elif 'Foreign&SearchWord' in str(author[a].get('href')):
x.execute('INSERT INTO book(originTitle) VALUES(' + author[a].text + ')')
a += 1
price = soup.find('td', {'class': 'p_goodstd02', 'valign': 'top'})
x.execute('INSERT INTO book(price) VALUES(' + price.text + ')')
size = soup.select('div[class="p_goodstd03"] td[style="text-align:left;"]')
try:
tmp = str(size[0].text)
except:
x.execute('INSERT INTO book(id) VALUES (' + str(page) + ')')
page += 1
continue
idx = tmp.find('|')
index = tmp.find('쪽')
if idx > index:
x.execute('INSERT INTO book(BookLength) VALUES(' + tmp[:index] + ')')
else:
x.execute('INSERT INTO book(BookLength) VALUES(' + tmp[idx+2:index] + ')')
index = tmp.find(')')
if index != -1:
x.execute('INSERT INTO book(sizeInfo) VALUES(' + tmp[index - 14:index + 1].lstrip() + ')')
index = tmp.find('g')
if index != -1:
x.execute('INSERT INTO book(sizeInfo) VALUES(' + tmp[index - 4:index].lstrip() + ')')
index = tmp.find('ISBN : ')
if index != -1:
x.execute('INSERT INTO book(BookLength) VALUES(' + tmp[index + 7:] + ')')
print(page)
x.execute('INSERT INTO book(id) VALUES (' + str(page) + ')')
page += 1
except:
x.execute('INSERT INTO book(id) VALUES (' + str(page) + ')')
page += 1
continue
spider('http://www.aladin.co.kr/shop/wproduct.aspx?ItemId=',1)
db.close()