Scrapy gets blocked during a long scraping process

Time: 2017-08-02 14:28:00

Tags: python mysql web-scraping scrapy blocking

I have a very long scraping job (it has to fetch about 800,000 items) that gets blocked before it completes. First of all, I want to apologize for this long question; there is a lot of code... I tried to give you as much information as possible.

I have tried a lot of things and read a lot of other questions, but I cannot solve the problem shown in this log:


[... everything works well ...]
2017-08-02 12:35:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&email=...&tool=...&query_key=1&webenv=NCID_1_167674051_130.14.18.34_9001_1501674632_1806773429_0MetA0_S_MegaStore_F_1&retmode=xml&retstart=664200&retmax=100>
{'abstract': None,
 'authors': [{'affiliation_info': None, 'first_name': u'I', 'last_name': u'Kimura'},
             {'affiliation_info': None, 'first_name': u'T', 'last_name': u'Sugiyama'},
             {'affiliation_info': None, 'first_name': u'Y', 'last_name': u'Ito'}],
 'doi': None,
 'journal': u'Proceedings of the Society for Experimental Biology and Medicine. Society for Experimental Biology and Medicine (New York, N.Y.)',
 'keywords': [],
 'publication_year': u'1967',
 'scrape_session_id': 1,
 'title': u'Papillomatous growth in sole from Wakasa Bay area.',
 'url': u'https://www.ncbi.nlm.nih.gov/pubmed/6027520'}
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Prepare item.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Check if article already exist.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Begin authors processing.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create author.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create author.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Begin keywords processing.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Get or create journal.
2017-08-02 12:35:55 [scraper.pipelines] DEBUG: Commit article.
2017-08-02 12:35:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&email=...&tool=...&query_key=1&webenv=NCID_1_167674051_130.14.18.34_9001_1501674632_1806773429_0MetA0_S_MegaStore_F_1&retmode=xml&retstart=664200&retmax=100>
{'abstract': None,
 'authors': [{'affiliation_info': None, 'first_name': u'V H', 'last_name': u'Donaldson'},
             {'affiliation_info': None, 'first_name': u'O D', 'last_name': u'Ratnoff'}],
 'doi': None,
 'journal': u'Proceedings of the Society for Experimental Biology and Medicine. Society for Experimental Biology and Medicine (New York, N.Y.)',
 'keywords': [],
 'publication_year': u'1967',
 'scrape_session_id': 1,
 'title': u'Effect of some analogues of bradykinin upon vascular permeability.',
 'url': u'https://www.ncbi.nlm.nih.gov/pubmed/6027514'}
2017-08-02 12:36:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 501 items/min)
2017-08-02 12:37:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:38:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:39:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:40:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
2017-08-02 12:41:32 [scrapy.extensions.logstats] INFO: Crawled 506 pages (at 0 pages/min), scraped 48229 items (at 0 items/min)
[... infinite ...]

After that, the Python process is still running, but nothing happens any more.
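To find out where the process is parked, it can be made to dump every thread's traceback on demand. A minimal sketch (assuming the `faulthandler` module, which is in the stdlib on Python 3 and a pip backport on the Python 2 this code runs on; register it once at start-up, e.g. in the spider's `__init__`):

import signal

import faulthandler

# After this, `kill -USR1 <pid>` makes the stuck process print a traceback
# for every thread to stderr, enough to tell a blocked MySQL call from a
# blocked download.
faulthandler.register(signal.SIGUSR1, all_threads=True)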

MySQL no longer receives any requests.
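On the MySQL side, the obvious first check is whether the server still sees the connection and whether a transaction is stuck waiting for a lock. A quick probe (a sketch only; it assumes the MySQLdb driver and uses placeholder credentials and database name):

import MySQLdb  # assumption: the driver the project uses under SQLAlchemy

conn = MySQLdb.connect(host='localhost', user='...', passwd='...', db='...')
cur = conn.cursor()

# One row per server thread: a connection idle in "Sleep" for hours while
# the spider hangs points at the client side; a "Waiting for ... lock"
# state points at the database itself.
cur.execute('SHOW FULL PROCESSLIST')
for row in cur.fetchall():
    print(row)

# The TRANSACTIONS section of this report lists lock waits explicitly.
cur.execute('SHOW ENGINE INNODB STATUS')
print(cur.fetchone()[2])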

Here is some information from the TelnetConsole, captured after the blocking started (around 2017-08-02 12:36:32):
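(For context: this output comes from Scrapy's built-in telnet console extension, which is enabled by default and listens on localhost:6023. Connecting gives a Python prompt inside the running crawler, where `est()` and `stats` are predefined shortcuts:)

$ telnet localhost 6023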

>>> est()
Execution engine status

time()-engine.start_time                        : 8502.19854903
engine.has_capacity()                           : False
len(engine.downloader.active)                   : 16
engine.scraper.is_idle()                        : False
engine.spider.name                              : pubmed_spider
engine.spider_is_idle(engine.spider)            : False
engine.slot.closing                             : False
len(engine.slot.inprogress)                     : 16
len(engine.slot.scheduler.dqs or [])            : 0
len(engine.slot.scheduler.mqs)                  : 6605
len(engine.scraper.slot.queue)                  : 0
len(engine.scraper.slot.active)                 : 0
engine.scraper.slot.active_size                 : 0
engine.scraper.slot.itemproc_size               : 0
engine.scraper.slot.needs_backout()             : False

>>> stats.get_stats()
{
    'spider_name': 'Pubmed',
    'memusage/startup': 63430656,
    'scrape_session_id': 1,
    'log_count/INFO': 148,
    'downloader/response_count': 506,
    'downloader/response_bytes': 20485075,
    'item_dropped_count': 2182,
    'item_dropped_reasons_count/DropItem': 2182,
    'scrape_session_query': 'skin',
    'log_count/DEBUG': 582414,
    'scheduler/dequeued': 522,
    'log_count/WARNING': 2183,
    'request_depth_max': 1,
    'start_time': datetime.datetime(2017, 8, 2, 11, 50, 32, 57921),
    'downloader/request_method_count/GET': 522,
    'log_count/CRITICAL': 2,
    'memusage/max': 177164288,
    'downloader/request_bytes': 371843,
    'downloader/response_status_count/200': 506,
    'response_received_count': 506,
    'scheduler/enqueued/memory': 7127,
    'item_scraped_count': 48229,
    'scheduler/dequeued/memory': 522,
    'scheduler/enqueued': 7127,
    'downloader/request_count': 522
}
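For what it's worth: `len(engine.downloader.active)` is 16, which is exactly the `CONCURRENT_REQUESTS` default, the scraper slot is completely empty, and 6605 requests are still sitting in the memory scheduler. So the engine believes 16 requests are permanently in flight. From the same telnet session those requests can be inspected directly (a sketch of follow-up commands; `engine.downloader.active` is the set of in-flight Request objects, and each downloader slot records a `lastseen` timestamp):

>>> [r.url for r in engine.downloader.active]
>>> import time
>>> [(key, round(time.time() - slot.lastseen)) for key, slot in engine.downloader.slots.items()]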

Here is my code:

Spider

from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.loader import ItemLoader
from scrapy.spiders import Spider
from scrapy.utils.iterators import xmliter
from scrapy.utils.spider import iterate_spider_output
from sqlalchemy import func
# (project-local imports, ScrapeSession, Query, PubmedArticleItem and
#  PubmedAuthorItem, left out here)


class PubmedSpider(Spider):
    name = 'pubmed_spider'

    pubmed_email = '...'
    pubmed_tool = '...'

    allowed_domains = ['eutils.ncbi.nlm.nih.gov']

    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
    base_params = 'db=pubmed&email=%s&tool=%s' % (pubmed_email, pubmed_tool)

    root_itertag = 'PubmedArticle'
    namespaces = ()

    webenv = None

    scrape_session_id = None
    source_total_count = 0

    ret_max = 100

    def __init__(self, query=None, source=None, system_session=None, publi_sci_session=None, *args, **kwargs):
        super(PubmedSpider, self).__init__(*args, **kwargs)
        self.query = query
        self.source = source
        self.system_session = system_session
        self.publi_sci_session = publi_sci_session

    def get_esearch_url(self):
        return '%s/esearch.fcgi?%s&term=%s&retmax=1&usehistory=y%s' % \
               (self.base_url,
                self.base_params,
                self.query.query,
                '&webenv=%s' % self.webenv if self.webenv else '')

    def get_efetch_url(self, query_key, retstart, retmax):
        return '%s/efetch.fcgi?%s&query_key=%s&webenv=%s&retmode=xml&retstart=%s&retmax=%s' %\
                           (self.base_url,
                            self.base_params,
                            query_key,
                            self.webenv,
                            retstart,
                            retmax)

    def get_last_crawl_session(self, query):
        return self.system_session\
            .query(func.sum(ScrapeSession.items_scraped + ScrapeSession.items_dropped).label("total_items_scraped"))\
            .join(ScrapeSession.queries)\
            .filter(ScrapeSession.source == self.source)\
            .filter(Query.id == query.id)\
            .first()

    def start_requests(self):
        return [Request(self.get_esearch_url(), callback=self.find_retmax)]

    def find_retmax(self, response):
        self.webenv = response.selector.xpath('/eSearchResult/WebEnv/text()').extract()[0]
        count = int(response.selector.xpath('/eSearchResult/Count/text()').extract()[0])
        query_key = response.selector.xpath('/eSearchResult/QueryKey/text()').extract()[0]
        last_session = self.get_last_crawl_session(self.query)
        self.source_total_count = count
        if not last_session.total_items_scraped:
            retmax = self.ret_max
        else:
            retmax = count - last_session.total_items_scraped
        if retmax is not None and retmax < 1:
            raise CloseSpider("No new publications for '%s'." % self.query.query)
        retstart = last_session.total_items_scraped
        offsets = [retstart]
        while retstart < count - self.ret_max:
            retstart += self.ret_max
            offsets.append(retstart)
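        # e.g. with count=800000 and 664200 items already scraped, offsets run
        # 664200, 664300, ..., 799900: one efetch request per batch of 100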
        for offset in offsets:
            efetch_url = self.get_efetch_url(query_key, offset, self.ret_max)
            yield Request(efetch_url, callback=self.parse)

    def parse_article(self, response, node):
        authors_nodes = node.xpath('MedlineCitation/Article/AuthorList/Author')
        authors = self.parse_tag(response, authors_nodes, self.parse_author)
        loader = ItemLoader(item=PubmedArticleItem(), selector=node)
        loader.add_xpath('title', 'MedlineCitation/Article/ArticleTitle/text()')
        loader.add_xpath('title', 'MedlineCitation/Article/VernacularTitle/text()')
        loader.add_xpath('abstract', 'MedlineCitation/Article/Abstract/AbstractText/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/DateCreated/Year/text()')
        loader.add_xpath('publication_year', 'MedlineCitation/DateRevised/Year/text()')
        loader.add_xpath('doi', 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]/text()')
        loader.add_xpath('doi', 'MedlineCitation/Article/ELocationID[@EIdType="doi"]/text()')
        loader.add_xpath('journal', 'MedlineCitation/Article/Journal/Title/text()')
        loader.add_xpath('keywords', 'MedlineCitation/KeywordList/Keyword/text()')
        loader.add_value('url', 'https://www.ncbi.nlm.nih.gov/pubmed/')
        loader.add_xpath('url', 'MedlineCitation/PMID/text()')
        loader.add_value('authors', authors)
        loader.add_value('scrape_session_id', self.scrape_session_id)
        yield loader.load_item()

    def parse_author(self, response, node):
        loader = ItemLoader(item=PubmedAuthorItem(), selector=node)
        loader.add_xpath('last_name', 'LastName/text()')
        loader.add_xpath('first_name', 'ForeName/text()')
        loader.add_xpath('affiliation_info', 'AffiliationInfo/Affiliation/text()')
        return loader.load_item()

    def parse_tag(self, response, nodes, callback):
        results = []
        for selector in nodes:
            results.append(callback(response, selector))
        return results

    def parse_nodes(self, response, nodes, callback):
        for selector in nodes:
            for result_item in iterate_spider_output(callback(response, selector)):
                yield result_item

    def parse(self, response, itertag=root_itertag):
        nodes = self._iternodes(response, itertag)
        return self.parse_nodes(response, nodes, self.parse_article)

    def _iternodes(self, response, itertag):
        for node in xmliter(response, itertag):
            self._register_namespaces(node)
            yield node

    def _register_namespaces(self, selector):
        for (prefix, uri) in self.namespaces:
            selector.register_namespace(prefix, uri)
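Pipeline

I will spare you the full pipeline listing; its flow is exactly what the DEBUG messages above show: prepare the item, check whether the article already exists, get-or-create the authors, keywords and journal, then commit, all through a synchronous SQLAlchemy session. One detail that may matter: `process_item` runs on the Twisted reactor thread, so a `commit()` that MySQL never answers (dropped connection, lock wait) would freeze the whole crawl, which matches the symptom above. A minimal sketch of how the same work could be moved off the reactor thread (names are placeholders, not my real code):

from twisted.internet import threads


class BlockingSafePipeline(object):
    """Sketch: run the blocking DB work on Twisted's thread pool."""

    def process_item(self, item, spider):
        # Returning the Deferred lets Scrapy wait for the item without
        # blocking the reactor, so downloads keep flowing meanwhile.
        return threads.deferToThread(self._store, item, spider)

    def _store(self, item, spider):
        # NOTE: a single SQLAlchemy Session is not thread-safe; a real
        # version would use a scoped_session (or one session per call).
        session = spider.publi_sci_session
        # ... existence check, get-or-create authors/keywords/journal ...
        session.commit()  # now blocks only a worker thread, not the crawl
        return item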

If any of you have an idea, that would be really great. :)

Thanks in advance!

0 Answers:

No answers yet.