我在Python中有以下代码。
"""
Module to encapsulate body parsing.
"""
from urlparse import urlparse
from bs4 import BeautifulSoup,Comment
import os
import shutil
from hct import utils
BASE_DIR = os.getcwd()
PAGE_SOURCE_CMD = 'phantomas %s --page-source'
FEO_PAGE_SOURCE_CMD = 'phantomjs RequestURL.js %s > body.html'
class Extractor(object):
"""
This file provides utility to do body parsing of an url.
"""
def __init__(self):
pass
def check_tags(self, tags, feed):
"""
Method: Method to handle the tags as encountered during parsing.
Also contains the business logic to check to prefetch and
preresolve DNS eanblement
Args: Takes the tag and its attributes as a list
Returns: A dictionary of tags and their values.
"""
result = {}
for tag in tags:
if len(feed.select('link[rel='+tag+']')) > 0:
result['link'] = tag
return result
def get_generated_html(self, url, has_headers):
"""
Method: Method to get the generated HTML content from Phantomas.
Args: Takes the url as an argument for which to get the HTML content.
hasHeaders defaulted to false for no headers.
Returns: Nothing.
"""
if not urlparse(url).scheme:
url = 'http://'+url
if has_headers == False:
command = PAGE_SOURCE_CMD % url
else:
command = FEO_PAGE_SOURCE_CMD % url
utils.execute_command(command).communicate()
def create_analysis_folder(self, analysis_id, has_headers):
if not os.path.exists(os.path.join(BASE_DIR, analysis_id)):
os.makedirs(os.path.join(BASE_DIR,analysis_id))
path = os.path.join(BASE_DIR, analysis_id, 'html')
if has_headers:
os.makedirs(path)
shutil.copy(os.path.join(BASE_DIR, "RequestURL.js"), path)
return path
"""
def create_analysis_folder(self, analysis_id, has_headers):
Method: To create a folder to fetch and analyse the HTML based on
analysis ID.
Args: Takes the Analsis ID as an argument.
Returns: The path to the created folder.
analysis_id = str(analysis_id)
path = None
if not os.path.exists(analysis_id):
os.makedirs(analysis_id)
os.chdir(analysis_id)
if has_headers == False:
path = os.getcwd() + '/html'
print path
return path
else:
print "coming here"
os.makedirs('html')
os.chdir('html')
shutil.copy("../../hct/data_processors/RequestURL.js", os.getcwd())
return os.getcwd()
"""
def start_parser(self, analysis_id, url, hasHeaders=False):
"""
Method: Method to start the parser.
Args: Analsyis ID and URL as an argument.
Returns: Nothing.
"""
feed = None
analysis_id = str(analysis_id)
path = self.create_analysis_folder(analysis_id, hasHeaders)
os.chdir(path)
self.get_generated_html(url, hasHeaders)
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith('.html'):
feed = BeautifulSoup(open(path + '/' +file).read())
print feed
if hasHeaders:
os.chdir('..')
#shutil.rmtree(path)
break
return feed
我根据某些条件创建文件夹,下载HTML源,然后在解析后必须删除文件夹。该模块将被多次调用。问题是它第一次正常工作。但第二次行#shutil.rmtree(path)
抛出错误No such file or directory
,因为它试图在不存在的目录中创建一个目录,该目录在第一次调用该方法时被删除。
我如何克服这个问题,因为我必须删除目录,否则它会泛滥服务器内存
答案 0 :(得分:1)
您可以使用os.mkdir()
在shutil.rmtree()
之后立即重新创建目录。
答案 1 :(得分:1)
如果shutil.rmtree(path)
为path
目录本身引发“没有此类文件或目录”异常,那么它可能是shutil.rmtree()
中的错误 - 它应该是很高兴有人为它做了工作 - 它应该忽略这些错误。
您可以编写一个解决问题的包装器:
import errno
import shutil
def rmtree(path):
try:
return shutil.rmtree(path)
except OSError as e:
if e.errno == errno.ENOENT and e.filename == path:
pass # path is gone already -- ignore the exception
else:
raise
您可以致电rmtree(path)
path
是否存在。
您也可以使用ignore_errors
, onerror
rmtree's parameters获得同样的效果。