我无法在我创建的python文件中加载函数。 python文件中的整个代码在Jupyter Notebook中运行没有任何问题。现在,我已将1:1的代码放入python文件中,我得到一个错误“无法从'connector'导入名称'get'”,其中connector是我的python文件,而get是嵌套在其中的函数。
我怀疑Jupyter Notebook的格式已更改并转换为常规python文件,从而以某种方式更改了文件的读取方式?
我想将代码移到python文件中,因为此功能对于我在抓取网页时在python中进行的多个项目很有用。我可以调用函数ratelimit函数,但是我似乎无法弄清楚我的get命令出了什么问题。
# Imports
import scraping_class
import pandas as pd
import requests,os,time
logfile="trustpilot.txt"
Connector = scraping_class.Connector(logfile)
def ratelimit(x):
"A function that handles the rate of your calls."
time.sleep(x) # sleep x seconds.
class Connector():
def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
"""This Class implements a method for reliable connection to the internet and monitoring.
It handles simple errors due to connection problems, and logs a range of information for basic quality assessments
Keyword arguments:
logfile -- path to the logfile
overwrite_log -- bool, defining if logfile should be cleared (rarely the case).
connector_type -- use the 'requests' module or the 'selenium'. Will have different since the selenium webdriver does not have a similar response object when using the get method, and monitoring the behavior cannot be automated in the same way.
session -- requests.session object. For defining custom headers and proxies.
path2selenium -- str, sets the path to the geckodriver needed when using selenium.
n_tries -- int, defines the number of retries the *get* method will try to avoid random connection errors.
timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
"""
## Initialization function defining parameters.
self.n_tries = n_tries # For avoiding triviel error e.g. connection errors, this defines how many times it will retry.
self.timeout = timeout # Defining the maximum time to wait for a server to response.
## not implemented here, if you use selenium.
if connector_type=='selenium':
assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
from selenium import webdriver
## HIN download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases
assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.
self.connector_type = connector_type # set the connector_type
if session: # set the custom session
self.session = session
else:
self.session = requests.session()
self.logfilename = logfile # set the logfile path
## define header for the logfile
header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
if os.path.isfile(logfile):
if overwrite_log==True:
self.log = open(logfile,'w')
self.log.write(';'.join(header))
else:
self.log = open(logfile,'a')
else:
self.log = open(logfile,'w')
self.log.write(';'.join(header))
## load log
with open(logfile,'r') as f: # open file
l = f.read().split('\n') # read and split file by newlines.
## set id
if len(l)<=1:
self.id = 0
else:
self.id = int(l[-1][0])+1
def get(self,url,project_name):
"""Method for connector reliably to the internet, with multiple tries and simple error handling, as well as default logging function.
Input url and the project name for the log (i.e. is it part of mapping the domain, or is it the part of the final stage in the data collection).
Keyword arguments:
url -- str, url
project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'.
"""
project_name = project_name.replace(';','-') # make sure the default csv seperator is not in the project_name.
if self.connector_type=='requests': # Determine connector method.
for _ in range(self.n_tries): # for loop defining number of retries with the requests method.
ratelimit()
t = time.time()
try: # error handling
response = self.session.get(url,timeout = self.timeout) # make get call
err = '' # define python error variable as empty assumming success.
success = True # define success variable
redirect_url = response.url # log current url, after potential redirects
dt = t - time.time() # define delta-time waiting for the server and downloading content.
size = len(response.text) # define variable for size of html content of the response.
response_code = response.status_code # log status code.
## log...
call_id = self.id # get current unique identifier for the call
self.id+=1 # increment call id
#['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row to be written in the log.
self.log.write('\n'+';'.join(map(str,row))) # write log.
return response,call_id # return response and unique identifier.
except Exception as e: # define error condition
err = str(e) # python error
response_code = '' # blank response code
success = False # call success = False
size = 0 # content is empty.
redirect_url = '' # redirect url empty
dt = t - time.time() # define delta t
## log...
call_id = self.id # define unique identifier
self.id+=1 # increment call_id
row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
self.log.write('\n'+';'.join(map(str,row))) # write row to log.
else:
t = time.time()
ratelimit()
self.browser.get(url) # use selenium get method
## log
call_id = self.id # define unique identifier for the call.
self.id+=1 # increment the call_id
err = '' # blank error message
success = '' # success blank
redirect_url = self.browser.current_url # redirect url.
dt = t - time.time() # get time for get method ... NOTE: not necessarily the complete load time.
size = len(self.browser.page_source) # get size of content ... NOTE: not necessarily correct, since selenium works in the background, and could still be loading.
response_code = '' # empty response code.
row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
self.log.write('\n'+';'.join(map(str,row))) # write row to log file.
# Using selenium it will not return a response object, instead you should call the browser object of the connector.
## connector.browser.page_source will give you the html.
return call_id
logfile="trustpilot.txt" ## name your log file.
connector = Connector(logfile)
我希望能够从'connector.py'加载我的'get'函数
答案 0 :(得分:0)
由于连接器变量和连接器类的命名冲突,以下行可能是问题:
Connector = scraping_class.Connector(logfile)