Dear community,

I have another question about domains that keep changing. I am scraping links from web pages, and I found this link:

https://www.spar.si/sl_SI/splet/spar-slovenija-twitter.html

I would like to stay on the same domain, but when my spider follows this link, it changes into:

https://twitter.com/sparslovenija

Is there a way to tell the spider not to go to other domains? For now the only solution I see is to check the link in "parse" and, if the domain contains the word "twitter", tell the spider not to go any further, but I don't think that is an elegant solution. I would like to detect the domain change automatically, no matter what it changes to. Do you have any ideas?
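To make that workaround concrete, this is roughly the check I have in mind: compare the domain (netloc) the spider actually landed on with the domain I started from, so it works no matter which domain the link redirects to. This is only a rough, untested sketch; start_netloc is a made-up name here, set from my start url:

from urlparse import urlparse

#Rough sketch only (not tested).
#start_netloc is a made-up name: the domain of the site I am crawling.
start_netloc = urlparse("http://www.seltron.si/").netloc

def landed_offsite(response_url):
    #True if the response ended up on another domain, e.g. after the
    #redirect from the spar.si link above to twitter.com.
    return urlparse(response_url).netloc != start_netloc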
Thanks in advance.

My code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# encoding=UTF-8
import scrapy, time, sys
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from urlparse import urlparse, urljoin
from vacancies.items import JobItem
#We need that in order to force Slovenian pages instead of English pages. It happened at "http://www.g-gmi.si/gmiweb/" that only English pages were found and no Slovenian.
from scrapy.conf import settings
settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl',}
#settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl,en;q=0.8',}
#start_time = time.time()
# We run the programme in the command line with this command:
# scrapy crawl jobs -o urls.csv -t csv --logfile log.txt
# We get two output files
# 1) urls.csv
# 2) log.txt
# Url whitelist.
with open("Q:/SIIT/JV_Marko_Boro/Detector/kljucne_besede/url_whitelist.txt", "r+") as kw:
url_whitelist = kw.read().replace('\n', '').split(",")
url_whitelist = map(str.strip, url_whitelist)
# Tab whitelist.
# We need to replace character the same way as in detector.
with open("Q:/SIIT/JV_Marko_Boro/Detector/kljucne_besede/tab_whitelist.txt", "r+") as kw:
tab_whitelist = kw.read().decode(sys.stdin.encoding).encode('utf-8')
tab_whitelist = tab_whitelist.replace('Ŕ', 'č')
tab_whitelist = tab_whitelist.replace('╚', 'č')
tab_whitelist = tab_whitelist.replace('Ő', 'š')
tab_whitelist = tab_whitelist.replace('Ü', 'š')
tab_whitelist = tab_whitelist.replace('Ä', 'ž')
tab_whitelist = tab_whitelist.replace('×', 'ž')
tab_whitelist = tab_whitelist.replace('\n', '').split(",")
tab_whitelist = map(str.strip, tab_whitelist)
#File to write unique links
unique = open("G:/myVE/vacancies/unique_urls.txt", "wb")
class JobSpider(scrapy.Spider):
    name = "jobs"

    #Test sample of SLO companies
    start_urls = [
        "http://www.seltron.si/"
    ]
    print start_urls

    #Result of the programme is this list of job vacancies webpages.
    jobs_urls = []
    #I would like to see how many unique links we check on every page.
    unique_urls = []
    def parse(self, response):
        response.selector.remove_namespaces()
        #We take all urls, they are marked by "href". These are either webpages on our website or new websites.
        urls = response.xpath('//@href').extract()
        #Base url.
        base_url = get_base_url(response)
        #Loop through all urls on the webpage.
        for url in urls:
            url = url.strip()
            #Ignore ftp.
            if url.startswith("ftp"):
                continue
            #If url represents a picture, a document, a compression ... we ignore it. We might have to change that because some companies provide job vacancies information in PDF.
            if url.endswith((
                    #images
                    '.jpg', '.jpeg', '.png', '.gif', '.eps', '.ico', '.svg', '.tif', '.tiff',
                    '.JPG', '.JPEG', '.PNG', '.GIF', '.EPS', '.ICO', '.SVG', '.TIF', '.TIFF',
                    #documents
                    '.xls', '.ppt', '.doc', '.xlsx', '.pptx', '.docx', '.txt', '.csv', '.pdf', '.pd',
                    '.XLS', '.PPT', '.DOC', '.XLSX', '.PPTX', '.DOCX', '.TXT', '.CSV', '.PDF', '.PD',
                    #music and video
                    '.mp3', '.mp4', '.mpg', '.ai', '.avi', '.swf',
                    '.MP3', '.MP4', '.MPG', '.AI', '.AVI', '.SWF',
                    #compressions and other
                    '.zip', '.rar', '.css', '.flv', '.php',
                    '.ZIP', '.RAR', '.CSS', '.FLV', '.PHP',
            )):
                #self.f1.write("IMAGE " + str(url) + "\n")
                continue
            #If url includes characters like ?, %, &, # ... it is LIKELY NOT to be the one we are looking for and we ignore it.
            #However in this case we exclude good urls like http://www.mdm.si/company#employment
            if any(x in url for x in ['%', '~']):
                continue
            #We need to save the original href value for the xpath lookup below, before we change it (join it with base_url).
            url_xpath = url
            #If url doesn't start with "http", it is a relative url, and we add base url to get the absolute url.
            # -- It is true that we may get some strange urls, but it is fine for now.
            if not url.startswith("http"):
                url = urljoin(base_url, url)
                #self.f.write(str(url.encode('utf-8')).strip() + "\n")
            #Counting unique links.
            if url not in self.unique_urls:
                self.unique_urls.append(url)
                #Encode to utf-8 so urls with non-ascii characters don't break the write.
                unique.write(url.encode('utf-8') + "\n")
            #We don't want to go to other websites. We want to stay on our website, so we keep only urls with the domain (netloc) of the company we are investigating.
            if urlparse(url).netloc == urlparse(base_url).netloc:
                #The main part. We look for webpages whose urls include one of the employment words as strings.
                #We check the tab of the url as well. This is an additional filter, suggested by Dan Wu, to improve accuracy.
                tabs = response.xpath('//a[@href="%s"]/text()' % url_xpath).extract()
                # Sometimes tabs can be just empty spaces like '\t' and '\n' so in this case we replace them with [].
                # That was the case when the spider didn't find this employment url: http://www.terme-krka.com/si/sl/o-termah-krka/o-podjetju-in-skupini-krka/zaposlitev/
                tabs = [tab.encode('utf-8') for tab in tabs]
                tabs = [tab.replace('\t', '') for tab in tabs]
                tabs = [tab.replace('\n', '') for tab in tabs]
                tab_empty = True
                for tab in tabs:
                    if tab != '':
                        tab_empty = False
                if tab_empty == True:
                    tabs = []
                # -- Instruction.
                # -- Users in other languages, please insert employment words in your own language, like jobs, vacancies, career, employment ... --
                # keyword_url starts empty, then we add keywords as we find them in the url. This is for tracking purposes.
                keyword_url = ''
                #if any(x in url for x in keywords):
                for keyword in url_whitelist:
                    if keyword in url:
                        keyword_url = keyword_url + keyword + ' '
                # If we find at least one keyword in the url, we continue.
                if keyword_url != '':
                    #1. Tabs are empty.
                    if tabs == []:
                        #print "No text for url: " + str(url)
                        #We found a url that includes one of the magic words, but there is no anchor text to check.
                        #We check the url, if we have found it before. If it is new, we add it to the list "jobs_urls".
                        if url not in self.jobs_urls:
                            self.jobs_urls.append(url)
                            item = JobItem()
                            item["url"] = url
                            item["keyword_url"] = keyword_url
                            item["keyword_url_tab"] = ' '
                            item["keyword_tab"] = ' '
                            print url
                            #We return the item.
                            yield item
                    #2. There are texts, one or more.
                    else:
                        #For the same partial url several texts are possible.
                        for tab in tabs:
                            keyword_url_tab = ''
                            for key in tab_whitelist:
                                if key in tab:
                                    keyword_url_tab = keyword_url_tab + key + ' '
                            if keyword_url_tab != '':
                                # keyword_url_tab starts with keyword_url from before, because we want to remember keywords from both url and tab.
                                keyword_url_tab = 'URL ' + keyword_url + ' TAB ' + keyword_url_tab
                                #if any(x in text for x in keywords):
                                #We found a url that includes one of the magic words and the tab includes a magic word as well.
                                #We check the url, if we have found it before. If it is new, we add it to the list "jobs_urls".
                                if url not in self.jobs_urls:
                                    self.jobs_urls.append(url)
                                    item = JobItem()
                                    item["url"] = url
                                    item["keyword_url"] = ' '
                                    item["keyword_url_tab"] = keyword_url_tab
                                    item["keyword_tab"] = ' '
                                    print url
                                    #We return the item.
                                    yield item
                else:
                    for tab in tabs:
                        #print "TABS " + str(tabs)
                        #print "TAB " + str(type(tab))
                        keyword_tab = ''
                        for key in tab_whitelist:
                            #print "KEY " + str(type(key))
                            if key in tab:
                                keyword_tab = keyword_tab + key + ' '
                        if keyword_tab != '':
                            if url not in self.jobs_urls:
                                self.jobs_urls.append(url)
                                item = JobItem()
                                item["url"] = url
                                item["keyword_url"] = ' '
                                item["keyword_url_tab"] = ' '
                                item["keyword_tab"] = keyword_tab
                                print url
                                #We return the item.
                                yield item
                #We don't put an "else" sentence because we want to further explore the employment webpage to find possible new employment webpages.
                #We keep looking for employment webpages until we reach the DEPTH that we have set in settings.py.
                yield Request(url, callback=self.parse)
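P.S. I have also seen that a Scrapy spider can declare an allowed_domains list, roughly like the sketch below, but I am not sure whether that helps when a link on my own domain redirects to another domain, as with the spar.si link above, so I have not relied on it:

class JobSpider(scrapy.Spider):
    name = "jobs"
    #Sketch only: restrict the spider to one domain so that Scrapy's offsite
    #filtering drops requests to other domains before they are sent.
    allowed_domains = ["seltron.si"]
    start_urls = ["http://www.seltron.si/"]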