I am trying to run a simple crawler in Python:
import sys
import csv
import socket
import sqlite3
import logging
from optparse import OptionParser
from urlparse import urlparse
#pip install requests
import requests
#################################################################
# FUNCTION process_row_to_db.
# handle one row and push to the DB
#
#################################################################
def process_row_to_db(conn, data_row, comment, hostname):
    insert_stmt = "INSERT OR IGNORE INTO adstxt (SITE_DOMAIN, EXCHANGE_DOMAIN, SELLER_ACCOUNT_ID, ACCOUNT_TYPE, TAG_ID, ENTRY_COMMENT) VALUES (?, ?, ?, ?, ?, ? );"
    exchange_host = ''
    seller_account_id = ''
    account_type = ''
    tag_id = ''

    if len(data_row) >= 3:
        exchange_host = data_row[0].lower()
        seller_account_id = data_row[1].lower()
        account_type = data_row[2].lower()

    if len(data_row) == 4:
        tag_id = data_row[3].lower()

    # data validation heuristics
    data_valid = 1

    # Minimum length of a domain name is 1 character, not including extensions.
    # Domain Name Rules - Nic AG
    # www.nic.ag/rules.htm
    if(len(hostname) < 3):
        data_valid = 0

    if(len(exchange_host) < 3):
        data_valid = 0

    # could be single digit integers
    if(len(seller_account_id) < 1):
        data_valid = 0

    ## ads.txt supports 'DIRECT' and 'RESELLER'
    if(len(account_type) < 6):
        data_valid = 0

    if(data_valid > 0):
        logging.debug("%s | %s | %s | %s | %s | %s" % (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Insert a row of data using bind variables (protect against sql injection)
        c = conn.cursor()
        c.execute(insert_stmt, (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Save (commit) the changes
        conn.commit()
        return 1

    return 0

# end process_row_to_db #####
#################################################################
# FUNCTION crawl_to_db.
# crawl the URLs, parse the data, validate and dump to a DB
#
#################################################################
def crawl_to_db(conn, crawl_url_queue):
    rowcnt = 0

    myheaders = {
        'User-Agent': 'AdsTxtCrawler/1.0; +https://github.com/InteractiveAdvertisingBureau/adstxtcrawler',
        'Accept': 'text/plain',
    }

    for aurl in crawl_url_queue:
        ahost = crawl_url_queue[aurl]
        logging.info(" Crawling %s : %s " % (aurl, ahost))
        r = requests.get(aurl, headers=myheaders)
        logging.info(" %d" % r.status_code)

        if(r.status_code == 200):
            logging.debug("-------------")
            logging.debug(r.request.headers)
            logging.debug("-------------")
            logging.debug("%s" % r.text)
            logging.debug("-------------")

            tmpfile = 'tmpads.txt'
            with open(tmpfile, 'wb') as tmp_csv_file:
                tmp_csv_file.write(r.text)

            with open(tmpfile, 'rb') as tmp_csv_file:
                #read the line, split on first comment and keep what is to the left (if any found)
                line_reader = csv.reader(tmp_csv_file, delimiter='#', quotechar='|')
                comment = ''

                for line in line_reader:
                    logging.debug("DATA: %s" % line)

                    try:
                        data_line = line[0]
                    except:
                        data_line = ""

                    #determine delimiter, conservative = do it per row
                    if data_line.find(",") != -1:
                        data_delimiter = ','
                    elif data_line.find("\t") != -1:
                        data_delimiter = '\t'
                    else:
                        data_delimiter = ' '

                    # re-parse the data portion with the delimiter detected above
                    data_reader = csv.reader([data_line], delimiter=data_delimiter, quotechar='|')
                    for row in data_reader:
                        if len(row) > 0 and row[0].startswith('#'):
                            continue

                        if (len(line) > 1) and (len(line[1]) > 0):
                            comment = line[1]

                        rowcnt = rowcnt + process_row_to_db(conn, row, comment, ahost)

    return rowcnt

# end crawl_to_db #####
#################################################################
# FUNCTION load_url_queue
# Load the target set of URLs and reduce to an ads.txt domains queue
#
#################################################################
def load_url_queue(csvfilename, url_queue):
    cnt = 0

    with open(csvfilename, 'rb') as csvfile:
        targets_reader = csv.reader(csvfile, delimiter=',', quotechar='|')

        for row in targets_reader:
            if len(row) < 1 or row[0].startswith('#'):
                continue

            for item in row:
                host = "localhost"

                if "http:" in item or "https:" in item:
                    logging.info("URL: %s" % item)
                    parsed_uri = urlparse(item)
                    host = parsed_uri.netloc
                else:
                    host = item
                    logging.info("HOST: %s" % item)

                skip = 0

                try:
                    #print "Checking DNS: %s" % host
                    ip = socket.gethostbyname(host)

                    if "127.0.0" in ip:
                        skip = 0 #swap to 1 to skip localhost testing
                    elif "0.0.0.0" in ip:
                        skip = 1
                    else:
                        logging.info(" Validated Host IP: %s" % ip)
                except:
                    skip = 1

                if(skip < 1):
                    ads_txt_url = 'http://{thehost}/ads.txt'.format(thehost=host)
                    logging.info(" pushing %s" % ads_txt_url)
                    url_queue[ads_txt_url] = host
                    cnt = cnt + 1

    return cnt

# end load_url_queue #####
#### MAIN ####
arg_parser = OptionParser()
arg_parser.add_option("-t", "--targets", dest="target_filename",
                      help="list of domains to crawl ads.txt from", metavar="FILE")
arg_parser.add_option("-d", "--database", dest="target_database",
                      help="Database to dump crawled data into", metavar="FILE")
arg_parser.add_option("-v", "--verbose", dest="verbose", action='count',
                      help="Increase verbosity (specify multiple times for more)")
(options, args) = arg_parser.parse_args()

if len(sys.argv) == 1:
    arg_parser.print_help()
    exit(1)

log_level = logging.WARNING # default
if options.verbose == 1:
    log_level = logging.INFO
elif options.verbose >= 2:
    log_level = logging.DEBUG

logging.basicConfig(filename='adstxt_crawler.log', level=log_level, format='%(asctime)s %(filename)s:%(lineno)d:%(levelname)s %(message)s')

crawl_url_queue = {}
conn = None
cnt_urls = 0
cnt_records = 0

cnt_urls = load_url_queue(options.target_filename, crawl_url_queue)

if (cnt_urls > 0) and options.target_database and (len(options.target_database) > 1):
    conn = sqlite3.connect(options.target_database)
    with conn:
        cnt_records = crawl_to_db(conn, crawl_url_queue)
        if(cnt_records > 0):
            conn.commit()
    #conn.close()

print "Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database)
logging.warning("Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database))
logging.warning("Finished.")
I am using Python 2.7.9. I tried to install sqlite with this command:
python -m pip install sqlite
and got back:
Downloading/unpacking sqlite3
Could not find any downloads that satisfy the requirement sqlite3
Cleaning up...
No distributions at all found for sqlite3
Storing debug log for failure in ...\pip.log
The first step was this command:
$ sqlite3 adstxt.db < adstxt_crawler.sql
and I got this:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
I know this is very basic, but I could not find any relevant help with it; I would really appreciate any help you can give. Thanks.
Adam
Answer (score: 2):
The first error:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
is because you are trying to run the sqlite3 command-line tool, which is not installed on your system. Python ships with the sqlite3 module (so there is nothing to install with pip), but it does not include a standalone sqlite3 command-line program.
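Since the sqlite3 module is bundled with Python, one workaround is to create the database by running the schema file from Python instead of from the sqlite3 shell. This is only a sketch, not part of the original answer; it assumes adstxt_crawler.sql sits in the current directory and contains the CREATE TABLE statements the crawler expects:

import sqlite3

# open (or create) the database file the crawler will write to
conn = sqlite3.connect('adstxt.db')

# executescript() runs every statement in the schema file in one call
with open('adstxt_crawler.sql', 'r') as schema_file:
    conn.executescript(schema_file.read())

conn.commit()
conn.close()

After that, the crawler can be pointed at the same file with -d adstxt.db.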
The second error is a syntax error. In Python 3, print is an ordinary function, so it must be called with parentheses:
print('hello world')
You are probably trying to run Python 2 code with a Python 3 interpreter.
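A quick way to confirm which interpreter is actually running the script (a minimal check; the print call below works under both Python 2 and Python 3):

import sys

# prints the full version string of the interpreter executing this code
print(sys.version)

If this reports a 3.x version, the bare print statement near the end of the crawler script will trigger exactly that syntax error; either run the script with a Python 2.7 interpreter or convert its print statements to print() calls.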