使用beautifulsoup来抓取数据并使用python存储在sqlite3中

时间:2016-10-28 04:39:20

标签: python sqlite bs4

我是一个相当新手寻求帮助的一些编码,似乎我比我在这里咀嚼更多。 我应该从两个网站上抓取数据,收集两个游戏产品的价格,比较价格并将其存储在sqlite3数据库中,我可以以任何方式搜索,编辑或修改它。 目前我可以毫无问题地刮掉,但是将数据插入数据库是个问题。任何帮助将受到高度赞赏。

更新了源代码,仍然无法存储在db。

# import libraries
from bs4 import BeautifulSoup
import sqlite3
import sys
import urllib2
import datetime
import os
import requests


#Global variables
fishpond_price = []
fishpond_title = []
gameheader_price = []
gameheader_title = []
now = datetime.datetime.now()
time = now.strftime("%H:%M")
date = now.strftime("%d-%m-%Y")


 # combinelist=games1+games2
 #  for items in combinelist:
 #  c.execute("INSERT INTO user(name,store) VALUES(?,?);",[items[0],items[2]])
 #  c.execute("INSERT INTO user(name,store) VALUES(?,?);",[items[0],items[2]])

#Parsing Gameheader
def gameheader(games):
    print "Gameheader Prices_____________________"

    # scraping game data from gamehead
    for b in soup2.findAll('div', attrs={'class': 'price_area'}):
        #print b
        #c = b.find('div',{'class':'current_price_text'})
        #print c
        try:
            c = b.find('div', {'class':'current_price_text'})
            price2 = c.text
            print price2
        except:
            continue



#Parsing Fishpond
def fishpond(games):
    conn = sqlite3.connect('callofduty.db')
    print ("Updating prices from Fishpond")
    #Parse
    counter = 1
    for a in all_games.findAll('tr'):
        try:
            name = a.findAll('img', attrs={'class': 'photo'})
            print name[0]['title']
            fishpond_title.append(title)

            #Price
            price = a.findAll('span', attrs={'class': 'productSpecialPrice'})
            print price[0].text
            fishpond_price.append(price)
            #import pdb; pdb.set_trace()  # breakpoint 6384638a //

            #url
            site = a.findAll('head', attrs = {'class': 'base href'})
            print site[0].text

            games.append({'name': name[0]['title'], 'price': price[0].text, 'shop': 'Fishpond', 'site': site[0].text})

        except:
            continue
             ## NAME ## img and return the attr title
            ## PRICE ### use searach for div attr class_ = "productOriginalPrice"


        #INSERT INTO DATABASE USING FUNCTIONS
        for key in fishpond_price:
            conn.execute("INSERT INTO gameProduct (title, site, date, time) VALUES (?, ?, ?, ?) ", (key, "fishpond", date, time));

        for key in fishpond_title:
            conn.execute("INSERT INTO gamePrice (prod_id, price, date, time) VALUES (?, ?, ?, ?) ", (key, "fishpond", date, time));
    print games
    print ("Update Complete, {} Prices have been entered").format(counter)

    return games


#Creating database
conn = sqlite3.connect('callofduty.db') #<- initial database creation/connection
conn.execute('''CREATE TABLE IF NOT EXISTS gameProduct(prodCode INTEGER primarykey,prodName Text,prodSite Text);''')
conn.execute('''CREATE TABLE IF NOT EXISTS gamePrice(itemNum INTEGER,prodDate NUMERIC, prodTime NUMERIC,prodPrice NUMERIC);''')
print 'Detected first run, creating database'


#Scraping Data for Fishpond
# Games Website Url
# Fishpond url
fishpondURL = 'http://www.fishpond.com.au/q/call+of+duty?rid=1744546722'

# query the fishpond website and return the html to the variable 'page'
page = urllib2.urlopen(fishpondURL)

# parse the fishpond html using beautiful soap and store in variable `soup`
soup = BeautifulSoup(page, 'html.parser')

# # Take out the <div> of Fishpond name and get its value
name_box = soup.find('h1', attrs={'class': 'productSearch-price-container'})

#print all_games

prices = []

for link in soup.find_all("span", class_="productSpecialPrice"):
    prices.append(link.get_text())
#print prices
gamename = []
for link in soup.find_all("a", class_="blue_link fn url"):
    gamename.append(link.get_text())
#print gamename

print gamename[1], prices [1]
#Save into database
fishpondgames = zip(gamename, prices)

for fish in fishpondgames:
    print " : ".join(fish)


# #Site 2 Gameshead

all_games = soup.find_all("span", class_="category-products")

# Gamehead Url
gameURL = 'https://www.gamesmen.com.au/catalogsearch/result/?cat=&q=call+of+duty&dir=desc&order=relevance'

 #query the gamesmen website and return the html to the variable 'page'
page2 = urllib2.urlopen(gameURL)

#parse the gameheader html using beautiful soup and store in variable 'soup'
soup2 = BeautifulSoup(page2, 'html.parser')


all_games2 = soup.find_all("a", class_="category-products")

print all_games2

for link in soup.find_all("span", class_="price-box"):
    prices.append(link.get_text())


#print gamename

print gamename[1], prices [1]

#Save into database
gamesmen = zip(gamename, prices)

for game in gamesmen:
    print " : ".join(gamesmen)



conn.close()

0 个答案:

没有答案