Question

我使用python和sqlite3制作了简单的爬虫。但是cmd屏幕中有一些错误。所以我从stackoverflow.com搜索了这种错误。但我找不到解决方案。一些Q＆amp; A建议我必须在?这样的sqlite命令上使用%而不是SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url。但它没有用。

这是错误。

Traceback (most recent call last):
  File "C:\Python27\crawl.py", line 239, in (module)
    parseArticle( u )
  File "C:\Python27\crawl.py", line 146, in parseArticle
    gaterNeighborInfo(soup)
  File "C:\Python27\crawl.py", line 68, in gaterNeighborInfo
    if url and url.startswith('http://') and db.isCrawledURL(url)<1:
  File "C:\Python27\crawl.py", line 217, in isCrawledURL
    self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url) 
OperationalError: near "state": syntax error

如您所见，此错误似乎是分层的。但我不知道出了什么问题以及这个错误从哪里开始。

这是源代码。

# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import sqlite3

crawler_name = 'python_daum_crawler'
mainpage = 'http://blog.daum.net/'
mainpath = './data/'

# robot parser를 설정합니다.
rp = robotparser.RobotFileParser(mainpage + 'robot.txt')
rp.read()

def canFetch(url):
    "수집 가능 여부를 체크합니다."
    return rp.can_fetch(crawler_name, url)

def getContent(url, delay=1):
    "웹문서를 다운로드 합니다."
    time.sleep(delay)

    if not canFetch(url):
        # 웹마스터가 수집을 원치 않는 페이지는 수집을 하지 않습니다.
        print('This url can NOT be fetched by our crawler :', url)
        return None
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', crawler_name)]
        contents = opener.open(url).read()
    except:
        traceback.print_exc()
        return None
    return contents

def getArticleInfo(soup):
    "daum blog 내의 article info를 얻어 옵니다."

    rBlog = re.compile('.+blog.daum.net/|w+/|d+.*?')
    URLs = soup('a',{'href':rBlog})

    return [ u.get('href').split('?')[0] for u in URLs ]

def getOwnArticles(contents):
    "해당 블로그에 포함되는 글의 목록을 가져옵니다."
    ret = []
    soup = BeautifulSoup(contents)
    rBlog = re.compile('.+/BlogView.+')
    for u in soup('a', {'href':rBlog}):
        href = u.get('href')
        article = href.split('articleno=')[1].split('&')[0]
        if ret.count(article)<1:
            ret.append(article)
    return ret

def gatherNeighborInfo(soup):
    "이웃 블로거/혹은 다녀간 블로거 정보를 수집합니다."

    #daum blog 관련 주소를 찾습니다.
    rBlog = re.compile('http://blog.daum.net/|w+')
    Neighbors = soup('a',{'href':rBlog})
    cnt = 0
    for n in Neighbors:
        url = n.get('href')
        blogname = url.split('/')[-1]
        if url and url.startswith('http://') and db.isCrawledURL(url)<1:
            db.insertURL( url, 1 )

            url2 = getRedirectedURL(url)
            if not url2: continue
            re_url = 'http://blog.daum.net' + url2
            body = getContent(re_url, 0)
            if body:
                for u in getOwnArticles(body):
                    #자신의 글 주소를 db에 저장합니다.
                    fullpath = 'http://blog.daum.net/'+blogname+'/'+u
                    cnt += db.insertURL(fullpath)
    if cnt>0: print('%d neighbor articles inserted'%cnt)

def getRedirectedURL(url):
    "본문에 해당하는 프레임의 url을 얻어옵니다."
    contents = getContent(url)
    if not contents: return None

    #redirect
    try:
        soup = BeautifulSoup(contents)
        frame = soup('frame')
        src = frame[0].get('src')
    except:
        src = None
    return src

def getBody(soup, parent):
    "본문 텍스트를 구합니다."

    #본문 주소를 포함한 iframe을 찾습니다.
    rSrc = re.compile('.+/ArticleContentsView.+')
    iframe = soup('iframe',{'src':rSrc})
    if len(iframe)>0:
        src = iframe[0].get('src')
        iframe_src = 'http://blog.daum.net'+src

        #그냥 request하면 안 되고, referer를 지정해야 browser를 통해 요청한 것으로 인식합니다.
        req = urllib2.Request(iframe_src)
        req.add_header('Refere', parent)
        body = urllib2.urlopen(req).read()
        soup = BeautifulSoup(body)
        return str(soup.body)
    else:
        print('NULL contents')
        return ''

def parseArticle(url):
    "해당 url을 parsing하고 저장합니다."

    #blog id와 article id를 얻습니다.
    article_id = url.split('/')[-1]
    blog_id = url.split('/')[-2]

    #redirect된 주소를 얻어 옵니다.
    newURL = getRedirectedURL(url)

    if newURL:
        try:
            #blog 디렉터리를 만듭니다.
            os.mkdir(mainpath+blog_id)
        except:
            #디렉터리를 만들다 에러가 난 경우 무시합니다.
            pass

        newURL = 'http://blog.daum.net'+newURL
        contents = getContent(newURL, 0)
        if not contents:
            print('Null Contents...')
            #해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
            db.updateURL(url, -1)
            return

        #HTML을 파싱합니다.
        soup = BeautifulSoup(contents)

        #이웃 블로거 정보가 있나 확인합니다.
        gatherNeighborInfo(soup)

        #블로그 URL이 있을 경우 db에 삽입합니다.
        n=0
        for u in getArticleInfo(soup):
            n += db.insertURL(u)
        if n>0: print('inserted %d urls from %s'%(n,url))

        #title을 얻습니다.
        sp = contents.find('<title>')
        if sp>-1:
            ep = contents[sp+7:].find('<title>')
            title = contents[sp+7:sp+ep+7]
        else:
            title = ''

        #본문 HTML을 보기 쉽게 정리합니다.
        contents = getBody(soup, newURL)

        #script를 제거합니다.
        pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)
        pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)
        pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)

        #txt file을 저장합니다.
        fTXT = open( mainpath + blog_id + '/' + article_id + '.txt', 'w')
        fTXT.write( title+'|n')
        fTXT.write(contents)
        fTXT.close()

        #처리했다고 db에 표시합니다.
        db.updateURL(url)

    else:
        print('Invalid blog article...')
        #해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
        db.updateURL(url, -1)

class DB:
    "SQLITE3 wrapper class"
    def __init__(self):
        self.conn = sqlite3.connect('crawlerDB')
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
        self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
        self.cursor.execute('CREATE INDEX IF NOT EXISTS IDX002 ON urls(state)')

    def __del__(self):
        self.conn.commit()
        self.cursor.close()

    def insertURL(self, url, state=0):
        try:
            self.cursor.execute("INSERT INTO urls VALUES ('%s',%d)"%(url,state))
            self.conn.commit()
        except:
            return 0
        else:
            return 1

    def selectUncrawledURL(self):
        self.cursor.execute('SELECT * FROM urls where state=0')
        return [ row[0] for row in self.cursor.fetchall() ]

    def updateURL(self, url, state=1):
        self.cursor.execute("UPDATE urls SET state=%d WHERE url='%s'"%(state,url))

    def isCrawledURL(self, url):
        self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
        ret = self.cursor.fetchone()
        return ret[0]

db = DB()

if __name__=='__main__':
    print('starting crawl.py...')

    #메인 페이지를 체크합니다.
    contents = getContent(mainpage)
    URLs = getArticleInfo( BeautifulSoup( contents ) )
    nSuccess = 0
    for u in URLs:
        nSuccess += db.insertURL(u)
    print('inserted %d new pages.'%nSuccess)

    while 1:
        for u in db.selectUncrawledURL():
            #아직 읽지 않은 url을 얻어서 처리합니다.
            print('downloading %s'%u)
            try:
                parseArticle( u )
            except:
                traceback.print_exc()
                db.updateURL( u, -1 )

Answer 1

您正在生成不正确的SQL;您可能需要url=... AND state=1（空格和AND符合这两个条件。

此外，您不应使用字符串插值，而是使用SQL参数：

def isCrawledURL(self, url):
    self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
    ret = self.cursor.fetchone()
    return ret[0]

这适用于您的所有查询，例如：

self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url,state))

和

self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state,url))

请注意，参数作为 second 参数（值序列）传递到cursor.execute()调用中。

Answer 2

在查询中显示状态之前，您缺少空格和AND关键字。

OperationalError：接近“state”：语法错误

2 个答案: