I am trying to run a simple crawler in Python:
import sys
import csv
import socket
import sqlite3
import logging
from optparse import OptionParser
from urlparse import urlparse
#pip install requests
import requests
#################################################################
# FUNCTION process_row_to_db.
# handle one row and push to the DB
#
#################################################################
def process_row_to_db(conn, data_row, comment, hostname):
    insert_stmt = "INSERT OR IGNORE INTO adstxt (SITE_DOMAIN, EXCHANGE_DOMAIN, SELLER_ACCOUNT_ID, ACCOUNT_TYPE, TAG_ID, ENTRY_COMMENT) VALUES (?, ?, ?, ?, ?, ? );"
    exchange_host = ''
    seller_account_id = ''
    account_type = ''
    tag_id = ''

    if len(data_row) >= 3:
        exchange_host = data_row[0].lower()
        seller_account_id = data_row[1].lower()
        account_type = data_row[2].lower()

    if len(data_row) == 4:
        tag_id = data_row[3].lower()

    # data validation heuristics
    data_valid = 1

    # Minimum length of a domain name is 1 character, not including extensions.
    # Domain Name Rules - Nic AG
    # www.nic.ag/rules.htm
    if(len(hostname) < 3):
        data_valid = 0

    if(len(exchange_host) < 3):
        data_valid = 0

    # could be single digit integers
    if(len(seller_account_id) < 1):
        data_valid = 0

    ## ads.txt supports 'DIRECT' and 'RESELLER'
    if(len(account_type) < 6):
        data_valid = 0

    if(data_valid > 0):
        logging.debug("%s | %s | %s | %s | %s | %s" % (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Insert a row of data using bind variables (protect against sql injection)
        c = conn.cursor()
        c.execute(insert_stmt, (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Save (commit) the changes
        conn.commit()
        return 1

    return 0

# end process_row_to_db #####
#################################################################
# FUNCTION crawl_to_db.
# crawl the URLs, parse the data, validate and dump to a DB
#
#################################################################
def crawl_to_db(conn, crawl_url_queue):
    rowcnt = 0

    myheaders = {
        'User-Agent': 'AdsTxtCrawler/1.0; +https://github.com/InteractiveAdvertisingBureau/adstxtcrawler',
        'Accept': 'text/plain',
    }

    for aurl in crawl_url_queue:
        ahost = crawl_url_queue[aurl]
        logging.info(" Crawling %s : %s " % (aurl, ahost))
        r = requests.get(aurl, headers=myheaders)
        logging.info(" %d" % r.status_code)

        if(r.status_code == 200):
            logging.debug("-------------")
            logging.debug(r.request.headers)
            logging.debug("-------------")
            logging.debug("%s" % r.text)
            logging.debug("-------------")

            tmpfile = 'tmpads.txt'
            with open(tmpfile, 'wb') as tmp_csv_file:
                tmp_csv_file.write(r.text)

            with open(tmpfile, 'rb') as tmp_csv_file:
                #read the line, split on first comment and keep what is to the left (if any found)
                line_reader = csv.reader(tmp_csv_file, delimiter='#', quotechar='|')
                comment = ''

                for line in line_reader:
                    logging.debug("DATA: %s" % line)

                    try:
                        data_line = line[0]
                    except:
                        data_line = ""

                    #determine delimiter, conservative = do it per row
                    if data_line.find(",") != -1:
                        data_delimiter = ','
                    elif data_line.find("\t") != -1:
                        data_delimiter = '\t'
                    else:
                        data_delimiter = ' '

                    # re-parse the data portion with the delimiter detected above
                    data_reader = csv.reader([data_line], delimiter=data_delimiter, quotechar='|')
                    for row in data_reader:
                        if len(row) > 0 and row[0].startswith('#'):
                            continue

                        if (len(line) > 1) and (len(line[1]) > 0):
                            comment = line[1]

                        rowcnt = rowcnt + process_row_to_db(conn, row, comment, ahost)

    return rowcnt

# end crawl_to_db #####
#################################################################
# FUNCTION load_url_queue
# Load the target set of URLs and reduce to an ads.txt domains queue
#
#################################################################
def load_url_queue(csvfilename, url_queue):
    cnt = 0

    with open(csvfilename, 'rb') as csvfile:
        targets_reader = csv.reader(csvfile, delimiter=',', quotechar='|')

        for row in targets_reader:
            if len(row) < 1 or row[0].startswith('#'):
                continue

            for item in row:
                host = "localhost"

                if "http:" in item or "https:" in item:
                    logging.info("URL: %s" % item)
                    parsed_uri = urlparse(item)
                    host = parsed_uri.netloc
                else:
                    host = item
                    logging.info("HOST: %s" % item)

                skip = 0

                try:
                    #print "Checking DNS: %s" % host
                    ip = socket.gethostbyname(host)

                    if "127.0.0" in ip:
                        skip = 0 #swap to 1 to skip localhost testing
                    elif "0.0.0.0" in ip:
                        skip = 1
                    else:
                        logging.info(" Validated Host IP: %s" % ip)
                except:
                    skip = 1

                if(skip < 1):
                    ads_txt_url = 'http://{thehost}/ads.txt'.format(thehost=host)
                    logging.info(" pushing %s" % ads_txt_url)
                    url_queue[ads_txt_url] = host
                    cnt = cnt + 1

    return cnt

# end load_url_queue #####
#### MAIN ####
arg_parser = OptionParser()
arg_parser.add_option("-t", "--targets", dest="target_filename",
                      help="list of domains to crawl ads.txt from", metavar="FILE")
arg_parser.add_option("-d", "--database", dest="target_database",
                      help="Database to dump crawled data into", metavar="FILE")
arg_parser.add_option("-v", "--verbose", dest="verbose", action='count',
                      help="Increase verbosity (specify multiple times for more)")
(options, args) = arg_parser.parse_args()

if len(sys.argv) == 1:
    arg_parser.print_help()
    exit(1)

log_level = logging.WARNING # default
if options.verbose == 1:
    log_level = logging.INFO
elif options.verbose >= 2:
    log_level = logging.DEBUG

logging.basicConfig(filename='adstxt_crawler.log', level=log_level, format='%(asctime)s %(filename)s:%(lineno)d:%(levelname)s %(message)s')

crawl_url_queue = {}
conn = None
cnt_urls = 0
cnt_records = 0

cnt_urls = load_url_queue(options.target_filename, crawl_url_queue)

if (cnt_urls > 0) and options.target_database and (len(options.target_database) > 1):
    conn = sqlite3.connect(options.target_database)
    with conn:
        cnt_records = crawl_to_db(conn, crawl_url_queue)
        if(cnt_records > 0):
            conn.commit()
    #conn.close()

print "Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database)
logging.warning("Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database))
logging.warning("Finished.")
I am using Python 2.7.9. I tried to install sqlite with this command:
python -m pip install sqlite
and got back:
Downloading/unpacking sqlite3
Could not find any downloads that satisfy the requirement sqlite3
Cleaning up...
No distributions at all found for sqlite3
Storing debug log for failure in ...\pip.log
The first step was this command:
$ sqlite3 adstxt.db < adstxt_crawler.sql
and I got this:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
I know this is very basic, but I could not find any relevant help with it; I would really appreciate any help you can give. Thanks.
Adam
Answer (score: 2):
The first error:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
is because you are trying to run the sqlite3 command-line tool, which is not installed on your system. Python ships with the sqlite3 module (so there is nothing to install with pip), but it does not include a standalone sqlite3 command-line program.
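Since the sqlite3 module is bundled with Python, one workaround is to create the database by running the schema file from Python instead of from the sqlite3 shell. This is only a sketch, not part of the original answer; it assumes adstxt_crawler.sql sits in the current directory and contains the CREATE TABLE statements the crawler expects:

import sqlite3

# open (or create) the database file the crawler will write to
conn = sqlite3.connect('adstxt.db')

# executescript() runs every statement in the schema file in one call
with open('adstxt_crawler.sql', 'r') as schema_file:
    conn.executescript(schema_file.read())

conn.commit()
conn.close()

After that, the crawler can be pointed at the same file with -d adstxt.db.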
The second error is a syntax error. In Python 3, print is an ordinary function, so it must be called with parentheses:
print('hello world')
You are probably trying to run Python 2 code with a Python 3 interpreter.
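A quick way to confirm which interpreter is actually running the script (a minimal check; the print call below works under both Python 2 and Python 3):

import sys

# prints the full version string of the interpreter executing this code
print(sys.version)

If this reports a 3.x version, the bare print statement near the end of the crawler script will trigger exactly that syntax error; either run the script with a Python 2.7 interpreter or convert its print statements to print() calls.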