我的抓取过程(总共需要抓取约80万条条目)在运行很长时间后、尚未完成之前就被阻塞了
首先,我想为这个长期的问题道歉,有很多代码...我希望尽可能多地给你信息。
我尝试了很多东西,阅读了很多其他的东西,但我无法解决这个日志所显示的问题:
Prelude> :browse Prelude
(!!) :: [a] -> Int -> a
($) ::
forall (r :: GHC.Types.RuntimeRep) a (b :: TYPE r).
(a -> b) -> a -> b
($!) :: (a -> b) -> a -> b
(&&) :: Bool -> Bool -> Bool
(++) :: [a] -> [a] -> [a]
(.) :: (b -> c) -> (a -> b) -> a -> c
(<$>) :: Functor f => (a -> b) -> f a -> f b
(=<<) :: Monad m => (a -> m b) -> m a -> m b
class Functor f => Applicative (f :: * -> *) where
pure :: a -> f a
(<*>) :: f (a -> b) -> f a -> f b
(*>) :: f a -> f b -> f b
(<*) :: f a -> f b -> f a
{-# MINIMAL pure, (<*>) #-}
data Bool = False | True
class Bounded a where
minBound :: a
maxBound :: a
{-# MINIMAL minBound, maxBound #-}
data Char = GHC.Types.C# GHC.Prim.Char#
data Double = GHC.Types.D# GHC.Prim.Double#
data Either a b = Left a | Right b
class Enum a where
succ :: a -> a
pred :: a -> a
toEnum :: Int -> a
fromEnum :: a -> Int
enumFrom :: a -> [a]
enumFromThen :: a -> a -> [a]
enumFromTo :: a -> a -> [a]
enumFromThenTo :: a -> a -> a -> [a]
{-# MINIMAL toEnum, fromEnum #-}
class Eq a where
(==) :: a -> a -> Bool
(/=) :: a -> a -> Bool
{-# MINIMAL (==) | (/=) #-}
type FilePath = String
data Float = GHC.Types.F# GHC.Prim.Float#
class Fractional a => Floating a where
pi :: a
exp :: a -> a
log :: a -> a
sqrt :: a -> a
(**) :: a -> a -> a
logBase :: a -> a -> a
sin :: a -> a
cos :: a -> a
tan :: a -> a
asin :: a -> a
acos :: a -> a
atan :: a -> a
sinh :: a -> a
cosh :: a -> a
tanh :: a -> a
asinh :: a -> a
acosh :: a -> a
atanh :: a -> a
GHC.Float.log1p :: a -> a
GHC.Float.expm1 :: a -> a
GHC.Float.log1pexp :: a -> a
GHC.Float.log1mexp :: a -> a
{-# MINIMAL pi, exp, log, sin, cos, asin, acos, atan, sinh, cosh,
asinh, acosh, atanh #-}
class Foldable (t :: * -> *) where
Data.Foldable.fold :: Monoid m => t m -> m
foldMap :: Monoid m => (a -> m) -> t a -> m
foldr :: (a -> b -> b) -> b -> t a -> b
Data.Foldable.foldr' :: (a -> b -> b) -> b -> t a -> b
foldl :: (b -> a -> b) -> b -> t a -> b
Data.Foldable.foldl' :: (b -> a -> b) -> b -> t a -> b
foldr1 :: (a -> a -> a) -> t a -> a
foldl1 :: (a -> a -> a) -> t a -> a
Data.Foldable.toList :: t a -> [a]
null :: t a -> Bool
length :: t a -> Int
elem :: Eq a => a -> t a -> Bool
maximum :: Ord a => t a -> a
minimum :: Ord a => t a -> a
sum :: Num a => t a -> a
product :: Num a => t a -> a
{-# MINIMAL foldMap | foldr #-}
class Num a => Fractional a where
(/) :: a -> a -> a
recip :: a -> a
fromRational :: Rational -> a
{-# MINIMAL fromRational, (recip | (/)) #-}
class Functor (f :: * -> *) where
fmap :: (a -> b) -> f a -> f b
(<$) :: a -> f b -> f a
{-# MINIMAL fmap #-}
newtype IO a
= GHC.Types.IO (GHC.Prim.State# GHC.Prim.RealWorld
-> (# GHC.Prim.State# GHC.Prim.RealWorld, a #))
type IOError = GHC.IO.Exception.IOException
data Int = GHC.Types.I# GHC.Prim.Int#
data Integer
= integer-gmp-1.0.0.1:GHC.Integer.Type.S# !GHC.Prim.Int#
| integer-gmp-1.0.0.1:GHC.Integer.Type.Jp# {-# UNPACK #-}integer-gmp-1.0.0.1:GHC.Integer.Type.BigNat
| integer-gmp-1.0.0.1:GHC.Integer.Type.Jn# {-# UNPACK #-}integer-gmp-1.0.0.1:GHC.Integer.Type.BigNat
class (Real a, Enum a) => Integral a where
quot :: a -> a -> a
rem :: a -> a -> a
div :: a -> a -> a
mod :: a -> a -> a
quotRem :: a -> a -> (a, a)
divMod :: a -> a -> (a, a)
toInteger :: a -> Integer
{-# MINIMAL quotRem, toInteger #-}
data Maybe a = Nothing | Just a
class Applicative m => Monad (m :: * -> *) where
(>>=) :: m a -> (a -> m b) -> m b
(>>) :: m a -> m b -> m b
return :: a -> m a
fail :: String -> m a
{-# MINIMAL (>>=) #-}
class Monoid a where
mempty :: a
mappend :: a -> a -> a
mconcat :: [a] -> a
{-# MINIMAL mempty, mappend #-}
class Num a where
(+) :: a -> a -> a
(-) :: a -> a -> a
(*) :: a -> a -> a
negate :: a -> a
abs :: a -> a
signum :: a -> a
fromInteger :: Integer -> a
{-# MINIMAL (+), (*), abs, signum, fromInteger, (negate | (-)) #-}
class Eq a => Ord a where
compare :: a -> a -> Ordering
(<) :: a -> a -> Bool
(<=) :: a -> a -> Bool
(>) :: a -> a -> Bool
(>=) :: a -> a -> Bool
max :: a -> a -> a
min :: a -> a -> a
{-# MINIMAL compare | (<=) #-}
data Ordering = LT | EQ | GT
type Rational = GHC.Real.Ratio Integer
class Read a where
readsPrec :: Int -> ReadS a
readList :: ReadS [a]
GHC.Read.readPrec :: Text.ParserCombinators.ReadPrec.ReadPrec a
GHC.Read.readListPrec :: Text.ParserCombinators.ReadPrec.ReadPrec
[a]
{-# MINIMAL readsPrec | readPrec #-}
type ReadS a = String -> [(a, String)]
class (Num a, Ord a) => Real a where
toRational :: a -> Rational
{-# MINIMAL toRational #-}
class (RealFrac a, Floating a) => RealFloat a where
floatRadix :: a -> Integer
floatDigits :: a -> Int
floatRange :: a -> (Int, Int)
decodeFloat :: a -> (Integer, Int)
encodeFloat :: Integer -> Int -> a
exponent :: a -> Int
significand :: a -> a
scaleFloat :: Int -> a -> a
isNaN :: a -> Bool
isInfinite :: a -> Bool
isDenormalized :: a -> Bool
isNegativeZero :: a -> Bool
isIEEE :: a -> Bool
atan2 :: a -> a -> a
{-# MINIMAL floatRadix, floatDigits, floatRange, decodeFloat,
encodeFloat, isNaN, isInfinite, isDenormalized, isNegativeZero,
isIEEE #-}
class (Real a, Fractional a) => RealFrac a where
properFraction :: Integral b => a -> (b, a)
truncate :: Integral b => a -> b
round :: Integral b => a -> b
ceiling :: Integral b => a -> b
floor :: Integral b => a -> b
{-# MINIMAL properFraction #-}
class Show a where
showsPrec :: Int -> a -> ShowS
show :: a -> String
showList :: [a] -> ShowS
{-# MINIMAL showsPrec | show #-}
type ShowS = String -> String
type String = [Char]
class (Functor t, Foldable t) => Traversable (t :: * -> *) where
traverse :: Applicative f => (a -> f b) -> t a -> f (t b)
sequenceA :: Applicative f => t (f a) -> f (t a)
mapM :: Monad m => (a -> m b) -> t a -> m (t b)
sequence :: Monad m => t (m a) -> m (t a)
{-# MINIMAL traverse | sequenceA #-}
data Word = GHC.Types.W# GHC.Prim.Word#
(^) :: (Num a, Integral b) => a -> b -> a
(^^) :: (Fractional a, Integral b) => a -> b -> a
all :: Foldable t => (a -> Bool) -> t a -> Bool
and :: Foldable t => t Bool -> Bool
any :: Foldable t => (a -> Bool) -> t a -> Bool
appendFile :: FilePath -> String -> IO ()
asTypeOf :: a -> a -> a
break :: (a -> Bool) -> [a] -> ([a], [a])
concat :: Foldable t => t [a] -> [a]
concatMap :: Foldable t => (a -> [b]) -> t a -> [b]
const :: a -> b -> a
curry :: ((a, b) -> c) -> a -> b -> c
cycle :: [a] -> [a]
drop :: Int -> [a] -> [a]
dropWhile :: (a -> Bool) -> [a] -> [a]
either :: (a -> c) -> (b -> c) -> Either a b -> c
error ::
forall (r :: GHC.Types.RuntimeRep) (a :: TYPE r).
GHC.Stack.Types.HasCallStack =>
[Char] -> a
errorWithoutStackTrace ::
forall (r :: GHC.Types.RuntimeRep) (a :: TYPE r). [Char] -> a
even :: Integral a => a -> Bool
filter :: (a -> Bool) -> [a] -> [a]
flip :: (a -> b -> c) -> b -> a -> c
fromIntegral :: (Integral a, Num b) => a -> b
fst :: (a, b) -> a
gcd :: Integral a => a -> a -> a
getChar :: IO Char
getContents :: IO String
getLine :: IO String
head :: [a] -> a
id :: a -> a
init :: [a] -> [a]
interact :: (String -> String) -> IO ()
ioError :: IOError -> IO a
iterate :: (a -> a) -> a -> [a]
last :: [a] -> a
lcm :: Integral a => a -> a -> a
lex :: ReadS String
lines :: String -> [String]
lookup :: Eq a => a -> [(a, b)] -> Maybe b
map :: (a -> b) -> [a] -> [b]
mapM_ :: (Foldable t, Monad m) => (a -> m b) -> t a -> m ()
maybe :: b -> (a -> b) -> Maybe a -> b
not :: Bool -> Bool
notElem :: (Foldable t, Eq a) => a -> t a -> Bool
odd :: Integral a => a -> Bool
or :: Foldable t => t Bool -> Bool
otherwise :: Bool
print :: Show a => a -> IO ()
putChar :: Char -> IO ()
putStr :: String -> IO ()
putStrLn :: String -> IO ()
read :: Read a => String -> a
readFile :: FilePath -> IO String
readIO :: Read a => String -> IO a
readLn :: Read a => IO a
readParen :: Bool -> ReadS a -> ReadS a
reads :: Read a => ReadS a
realToFrac :: (Real a, Fractional b) => a -> b
repeat :: a -> [a]
replicate :: Int -> a -> [a]
reverse :: [a] -> [a]
scanl :: (b -> a -> b) -> b -> [a] -> [b]
scanl1 :: (a -> a -> a) -> [a] -> [a]
scanr :: (a -> b -> b) -> b -> [a] -> [b]
scanr1 :: (a -> a -> a) -> [a] -> [a]
seq :: a -> b -> b
sequence_ :: (Foldable t, Monad m) => t (m a) -> m ()
showChar :: Char -> ShowS
showParen :: Bool -> ShowS -> ShowS
showString :: String -> ShowS
shows :: Show a => a -> ShowS
snd :: (a, b) -> b
span :: (a -> Bool) -> [a] -> ([a], [a])
splitAt :: Int -> [a] -> ([a], [a])
subtract :: Num a => a -> a -> a
tail :: [a] -> [a]
take :: Int -> [a] -> [a]
takeWhile :: (a -> Bool) -> [a] -> [a]
uncurry :: (a -> b -> c) -> (a, b) -> c
undefined ::
forall (r :: GHC.Types.RuntimeRep) (a :: TYPE r).
GHC.Stack.Types.HasCallStack =>
a
unlines :: [String] -> String
until :: (a -> Bool) -> (a -> a) -> a -> a
unwords :: [String] -> String
unzip :: [(a, b)] -> ([a], [b])
unzip3 :: [(a, b, c)] -> ([a], [b], [c])
userError :: String -> IOError
words :: String -> [String]
writeFile :: FilePath -> String -> IO ()
zip :: [a] -> [b] -> [(a, b)]
zip3 :: [a] -> [b] -> [c] -> [(a, b, c)]
zipWith :: (a -> b -> c) -> [a] -> [b] -> [c]
zipWith3 :: (a -> b -> c -> d) -> [a] -> [b] -> [c] -> [d]
(||) :: Bool -> Bool -> Bool
[... 一切运行正常 ...]
2017-08-02 12:35:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&email=...&tool=...&query_key=1&webenv=NCID_1_167674051_130.14.18.34_9001_1501674632_1806773429_0MetA0_S_MegaStore_F_1&retmode=xml&retstart=664200&retmax=100>
{'abstract': None,
'authors': [{'affiliation_info': None, 'first_name': u'I', 'last_name': u'Kimura'},
{'affiliation_info': None, 'first_name': u'T', 'last_name': u'Sugiyama'},
{'affiliation_info': None, 'first_name': u'Y', 'last_name': u'Ito'}],
'doi': None,
'journal': u'Proceedings of the Society for Experimental Biology and Medicine. Society for Experimental Biology and Medicine (New York, N.Y.)',
'keywords': [],
'publication_year': u'1967',
'scrape_session_id': 1,
'title': u'Papillomatous growth in sole from Wakasa Bay area.',
'url': u'https://www.ncbi.nlm.nih.gov/pubmed/6027520'}
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Prepare item.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Check if article already exist.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Begin authors processing.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create author.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create author.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Begin keywords processing.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create journal.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Commit article.
2017-08-02 12:35:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&email=...&tool=...&query_key=1&webenv=NCID_1_167674051_130.14.18.34_9001_1501674632_1806773429_0MetA0_S_MegaStore_F_1&retmode=xml&retstart=664200&retmax=100>
{'abstract': None,
'authors': [{'affiliation_info': None, 'first_name': u'V H', 'last_name': u'Donaldson'},
{'affiliation_info': None, 'first_name': u'O D', 'last_name': u'Ratnoff'}],
'doi': None,
'journal': u'Proceedings of the Society for Experimental Biology and Medicine. Society for Experimental Biology and Medicine (New York, N.Y.)',
'keywords': [],
'publication_year': u'1967',
'scrape_session_id': 1,
'title': u'Effect of some analogues of bradykinin upon vascular permeability.',
'url': u'https://www.ncbi.nlm.nih.gov/pubmed/6027514'}
2017-08-02 12:36:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 501 items/min)
2017-08-02 12:37:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:38:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:39:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:40:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:41:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
[... infinite ...]
之后,python进程已经运行但没有任何反应:
以下是TelnetConsole的一些信息(在进程阻塞之后获取):
2017-08-02 12:36:32
这是我的代码:
蜘蛛
>>> est()
Execution engine status
time()-engine.start_time : 8502.19854903
engine.has_capacity() : False
len(engine.downloader.active) : 16
engine.scraper.is_idle() : False
engine.spider.name : pubmed_spider
engine.spider_is_idle(engine.spider) : False
engine.slot.closing : False
len(engine.slot.inprogress) : 16
len(engine.slot.scheduler.dqs or []) : 0
len(engine.slot.scheduler.mqs) : 6605
len(engine.scraper.slot.queue) : 0
len(engine.scraper.slot.active) : 0
engine.scraper.slot.active_size : 0
engine.scraper.slot.itemproc_size : 0
engine.scraper.slot.needs_backout() : False
>>> stats.get_stats()
{
'spider_name': 'Pubmed',
'memusage/startup': 63430656,
'scrape_session_id': 1,
'log_count/INFO': 148,
'downloader/response_count': 506,
'downloader/response_bytes': 20485075,
'item_dropped_count': 2182,
'item_dropped_reasons_count/DropItem': 2182,
'scrape_session_query': 'skin',
'log_count/DEBUG': 582414,
'scheduler/dequeued': 522,
'log_count/WARNING': 2183,
'request_depth_max': 1,
'start_time': datetime.datetime(2017, 8, 2, 11, 50, 32, 57921),
'downloader/request_method_count/GET': 522,
'log_count/CRITICAL': 2,
'memusage/max': 177164288,
'downloader/request_bytes': 371843,
'downloader/response_status_count/200': 506,
'response_received_count': 506,
'scheduler/enqueued/memory': 7127,
'item_scraped_count': 48229,
'scheduler/dequeued/memory': 522,
'scheduler/enqueued': 7127,
'downloader/request_count': 522
}
蜘蛛(Spider)代码:
class PubmedSpider(Spider):
    """Scrapy spider that harvests PubMed article metadata through the
    NCBI E-utilities API (esearch + efetch with the history server)."""

    name = 'pubmed_spider'
    pubmed_email = '...'
    pubmed_tool = '...'
    allowed_domains = ['eutils.ncbi.nlm.nih.gov']
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
    base_params = 'db=pubmed&email=%s&tool=%s' % (pubmed_email, pubmed_tool)
    root_itertag = 'PubmedArticle'
    namespaces = ()
    webenv = None              # NCBI history-server token, set in find_retmax
    scrape_session_id = None
    source_total_count = 0     # total hit count reported by esearch
    ret_max = 100              # page size for each efetch request

    def __init__(self, query=None, source=None, system_session=None,
                 publi_sci_session=None, *args, **kwargs):
        """Store the query object and the DB sessions used by the pipelines.

        :param query: ORM object exposing ``.query`` (the search term) and ``.id``
        :param source: ORM object identifying the data source (PubMed)
        :param system_session: SQLAlchemy session for scrape-session bookkeeping
        :param publi_sci_session: SQLAlchemy session for publication data
        """
        super(PubmedSpider, self).__init__(*args, **kwargs)
        self.query = query
        self.source = source
        self.system_session = system_session
        self.publi_sci_session = publi_sci_session

    def get_esearch_url(self):
        # usehistory=y makes NCBI keep the result set server-side so that
        # efetch can page through it with query_key/webenv.
        return '%s/esearch.fcgi?%s&term=%s&retmax=1&usehistory=y%s' % \
            (self.base_url,
             self.base_params,
             self.query.query,
             '&webenv=%s' % self.webenv if self.webenv else '')

    def get_efetch_url(self, query_key, retstart, retmax):
        # Fetch one page (retstart..retstart+retmax) of the stored result set.
        return '%s/efetch.fcgi?%s&query_key=%s&webenv=%s&retmode=xml&retstart=%s&retmax=%s' % \
            (self.base_url,
             self.base_params,
             query_key,
             self.webenv,
             retstart,
             retmax)

    def get_last_crawl_session(self, query):
        """Return a row whose ``total_items_scraped`` sums items scraped +
        dropped over all previous sessions for this source/query.

        NOTE: SQL SUM over zero rows yields NULL, so ``total_items_scraped``
        is ``None`` when this query has never been crawled before — callers
        must treat that as 0.
        """
        return self.system_session\
            .query(func.sum(ScrapeSession.items_scraped + ScrapeSession.items_dropped)
                   .label("total_items_scraped"))\
            .join(ScrapeSession.queries)\
            .filter(ScrapeSession.source == self.source)\
            .filter(Query.id == query.id)\
            .first()

    def start_requests(self):
        # Single bootstrap request: esearch registers the query on the
        # history server and tells us how many results exist.
        return [Request(self.get_esearch_url(), callback=self.find_retmax)]

    def find_retmax(self, response):
        """Parse the esearch reply and schedule one efetch request per page.

        BUG FIX: ``last_session.total_items_scraped`` is ``None`` on the very
        first crawl (SUM over no rows); the original code used it directly as
        the start offset, which made the offset arithmetic below blow up.
        It now defaults to 0.
        """
        self.webenv = response.selector.xpath('/eSearchResult/WebEnv/text()').extract()[0]
        count = int(response.selector.xpath('/eSearchResult/Count/text()').extract()[0])
        query_key = response.selector.xpath('/eSearchResult/QueryKey/text()').extract()[0]
        last_session = self.get_last_crawl_session(self.query)
        self.source_total_count = count
        # Items already harvested in earlier sessions (0 on first run).
        already_scraped = last_session.total_items_scraped or 0
        if already_scraped and count - already_scraped < 1:
            raise CloseSpider("No new publications for '%s'." % self.query.query)
        # Build page offsets: already_scraped, +ret_max, ... up to count.
        retstart = already_scraped
        offsets = [retstart]
        while retstart < count - self.ret_max:
            retstart += self.ret_max
            offsets.append(retstart)
        for offset in offsets:
            efetch_url = self.get_efetch_url(query_key, offset, self.ret_max)
            yield Request(efetch_url, callback=self.parse)

    def parse_article(self, response, node):
        """Build a PubmedArticleItem from one <PubmedArticle> XML node.

        Repeated add_xpath calls on the same field act as fallbacks
        (e.g. VernacularTitle when ArticleTitle is missing).
        """
        authors_nodes = node.xpath('MedlineCitation/Article/AuthorList/Author')
        authors = self.parse_tag(response, authors_nodes, self.parse_author)
        loader = ItemLoader(item=PubmedArticleItem(), selector=node)
        loader.add_xpath('title', 'MedlineCitation/Article/ArticleTitle/text()')
        loader.add_xpath('title', 'MedlineCitation/Article/VernacularTitle/text()')
        loader.add_xpath('abstract', 'MedlineCitation/Article/Abstract/AbstractText/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/DateCreated/Year/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/DateRevised/Year/text()')
        loader.add_xpath('doi', 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]/text()')
        loader.add_xpath('doi', 'MedlineCitation/Article/ELocationID[@EIdType="doi"]/text()')
        loader.add_xpath('journal', 'MedlineCitation/Article/Journal/Title/text()')
        loader.add_xpath('keywords', 'MedlineCitation/KeywordList/Keyword/text()')
        # URL is assembled from the static prefix plus the PMID text node.
        loader.add_value('url', 'https://www.ncbi.nlm.nih.gov/pubmed/')
        loader.add_xpath('url', 'MedlineCitation/PMID/text()')
        loader.add_value('authors', authors)
        loader.add_value('scrape_session_id', self.scrape_session_id)
        yield loader.load_item()

    def parse_author(self, response, node):
        """Build a PubmedAuthorItem from one <Author> XML node."""
        loader = ItemLoader(item=PubmedAuthorItem(), selector=node)
        loader.add_xpath('last_name', 'LastName/text()')
        loader.add_xpath('first_name', 'ForeName/text()')
        loader.add_xpath('affiliation_info', 'AffiliationInfo/Affiliation/text()')
        return loader.load_item()

    def parse_tag(self, response, nodes, callback):
        """Eagerly map ``callback`` over ``nodes`` and collect the results."""
        results = []
        for selector in nodes:
            results.append(callback(response, selector))
        return results

    def parse_nodes(self, response, nodes, callback):
        """Lazily yield every item produced by ``callback`` for each node."""
        for selector in nodes:
            for result_item in iterate_spider_output(callback(response, selector)):
                yield result_item

    def parse(self, response, itertag=root_itertag):
        # Default response callback: iterate <PubmedArticle> nodes and
        # turn each one into an item via parse_article.
        nodes = self._iternodes(response, itertag)
        return self.parse_nodes(response, nodes, self.parse_article)

    def _iternodes(self, response, itertag):
        # xmliter streams matching subtrees without loading the whole
        # response DOM at once.
        for node in xmliter(response, itertag):
            self._register_namespaces(node)
            yield node

    def _register_namespaces(self, selector):
        for (prefix, uri) in self.namespaces:
            selector.register_namespace(prefix, uri)
如果你们有任何想法,那真的很棒。 :)
提前谢谢!