匹配图片网址使用

时间:2018-06-02 08:51:09

标签: regex scrapy

我使用findall来匹配html的这部分



var images = ["https://file-comic-3.anyacg.co/images/b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043.jpg","https://file-comic-3.anyacg.co/images/e2/4a/e24af25afe98d7808e14f7c5b97c4c3138e41f14_186960_728_1042.jpg","https://file-comic-3.anyacg.co/images/1e/33/1e33b7ff4dcd1eee079cbc68bf4f27a898ee096d_176390_728_1041.jpg","https://file-comic-3.anyacg.co/images/8a/50/8a50c18cea0216eda3b6f4b09a2bdf704af02fbe_171808_728_1042.jpg","https://file-comic-3.anyacg.co/images/83/3f/833f31724f69d0b7f3de47680ea18df8013c5b1f_364517_1456_1038.jpg","https://file-comic-3.anyacg.co/images/4f/a6/4fa60bb27f33409099028f7793253da769a207e6_178272_728_1041.jpg","https://file-comic-3.anyacg.co/images/d0/b4/d0b408f4b6b3da7a77e5a11c2a764d81fb7f92a2_152151_728_1040.jpg","https://file-comic-3.anyacg.co/images/c1/b0/c1b099f553e69e09308e421335c0017e7b40c75b_156443_728_1040.jpg","https://file-comic-3.anyacg.co/images/f1/af/f1afbc8d5c124d34c101042a92d3062fc46efc56_156238_728_1041.jpg","https://file-comic-3.anyacg.co/images/b2/16/b21602016ba353cbc19514f557e68c27314cc1f4_194152_728_1042.jpg","https://file-comic-3.anyacg.co/images/65/b4/65b46aa1172e88c4ac91f2c49a2bbf355cc4c9ae_187313_728_1041.jpg","https://file-comic-3.anyacg.co/images/85/cb/85cbf6f030a256e6d34795614d5b9d608a5c34f8_195300_728_1040.jpg","https://file-comic-3.anyacg.co/images/4d/4b/4d4b3df52d6daddd48ed62a80a74eb9ecee04bed_181403_728_1040.jpg","https://file-comic-3.anyacg.co/images/0a/eb/0aeb8c448357608bbbbd73352137ce06cc1f768f_174864_728_1041.jpg","https://file-comic-3.anyacg.co/images/0a/de/0ade49bc09e3e3926604ebb3ab1ae713a3c72066_161014_728_1042.jpg","https://file-comic-3.anyacg.co/images/74/77/747782638059e12cd86e5977f93a31337a78da1c_176703_728_1041.jpg","https://file-comic-3.anyacg.co/images/99/3a/993a2a607aa29168371d4601b854728f7de7d8b9_165821_728_1040.jpg","https://file-comic-3.anyacg.co/images/db/f2/dbf2f8dab3bfede728e303e00eed1f653669006f_172858_728_1042.jpg","https://file-comic-3.anyacg.co/images/03/c9/03c937ff490ae8ddcd5fb2aa727bc5d8ce7717bb_167647_728_1041.jpg","https://file-comic-3.anyacg.co/images/18/f8/18f833c369dfbbf8ef31314e6bce6ca2b649aab3_175425_728_1040.jpg","https://file-comic-3.anyacg.co/images/e9/be/e9be9ac1276c35a109e10c904365f52aa862772a_181236_728_1041.jpg","https://file-comic-3.anyacg.co/images/d8/b5/d8b520852d186214944b20eadc8442b6e1b713b3_168540_728_1040.jpg","https://file-comic-3.anyacg.co/images/04/e8/04e8e451faf5a625b08fc9c94b299c253ac31eb9_185289_728_1042.jpg","https://file-comic-3.anyacg.co/images/3c/ad/3cad1d60c038c6816ae4e6e6e06871be9a440a04_166071_728_1042.jpg","https://file-comic-3.anyacg.co/images/9f/1a/9f1a858197fb5c5f9a3c74a51587e5cd24edabe5_176811_728_1041.jpg","https://file-comic-3.anyacg.co/images/02/a0/02a0629523ff47881d411751b02aaa354c1f4426_169330_728_1041.jpg","https://file-comic-3.anyacg.co/images/a4/3c/a43cfc3cc6d1b12c0f4191cd781f59c1c0d702ec_184585_728_1042.jpg","https://file-comic-3.anyacg.co/images/a9/04/a9047c0ecfdf7a8fa61e4ec0e95af0e0e3aff38f_149755_728_1040.jpg","https://file-comic-3.anyacg.co/images/47/0a/470a00e1631e10b8eb85db02bae9a59bdc9b20f5_145859_728_1042.jpg","https://file-comic-3.anyacg.co/images/8d/9e/8d9ecd51414f1fd646e38e567d7243424a53d3f3_179215_728_1040.jpg","https://file-comic-3.anyacg.co/images/c3/5c/c35c3fcaa6e35f4b6ba57b4472eafa90440e68f8_173301_728_1041.jpg","https://file-comic-3.anyacg.co/images/7a/82/7a82e3b4d82db558c60292e522e9e7dd247f7128_223630_728_1040.jpg","https://file-comic-3.anyacg.co/images/2c/30/2c3070663d9f0bad7cb8982a60535a406d82ca4d_70588_600_500.jpg"];
</script>
&#13;
&#13;
&#13;

import scrapy
import re
from mangapark.items import MangaparkItem
class MangaparkSpiderSpider(scrapy.Spider):
    name = 'mangapark'
    allowed_domains = ['mangapark.org','anyacg.com']
    start_urls = []
    for i in range(192,199):
        start_urls.append('https://mangapark.org/chapter/670561/chihayafuru-ch-'+str(i))
    def parse(self, response):
        html=response.body.decode('utf-8')
        pics=re.findall("https://file-comic-3.anyacg.co/images/+(.*?).jpg",html)
        print(pics[1]) 

但是结果只给了我带(。*?)

的内容
2018-06-02 04:11:02 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-06-02 04:11:02 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://mangapark.org/robots.txt> (referer: None)
2018-06-02 04:11:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-192> (referer: None)
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
2018-06-02 04:11:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-194> (referer: None)
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
2018-06-02 04:11:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-195> (referer: None)
2018-06-02 04:11:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-197> (referer: None)
2018-06-02 04:11:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-198> (referer: None)
2018-06-02 04:11:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-193> (referer: None)
2018-06-02 04:11:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mangapark.org/chapter/670561/chihayafuru-ch-196> (referer: None)
**b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043
b0/96/b096b8a41eb81a90ab798afb094e75670b12b646_205789_728_1043**

如何通过&#39; https&#39;获取完整的网址?前缀和&#39; .jpg&#39;后缀?

0 个答案:

没有答案