浏览器删除xml文件中的换行符

时间:2018-04-17 17:22:08

标签: xml python-3.x lxml

我编写了一个 Python 3 脚本,用来输出格式化的 XML RSS 文件。但是当我在 Chrome 中打开输出的 xml 文件时,我发现换行符没有显示出来。这是我的代码:

import requests
import csv
import re
import math
from babel.numbers import format_decimal
from lxml import html, etree
from rfeed import *

class Scraper:
    """Download an FPDS Atom feed and republish unseen entries as an RSS file.

    ``parse_xml`` is the entry point. It fetches ``self.url``, pulls every
    ``<entry>...</entry>`` span out of the response, builds one ``Item`` per
    entry whose key is not yet listed in ``existing.csv``, writes the items to
    ``fpds_rss_feed.xml`` and appends the new keys to ``existing.csv``.
    All files are read and written relative to the current working directory.
    """

    # Seconds to wait on the server; without a timeout the HTTP call may
    # block indefinitely.
    REQUEST_TIMEOUT = 30

    # Query-string suffix appended to every entry's alternate link.
    _LINK_SUFFIX = "&s=FPDS&templateName=1.4.4&indexName=awardfull&sortBy=SIGNED_DATE&desc=Y"

    # Positional template for the item description (26 placeholders). The
    # continuation-line indentation is part of the emitted text.
    _DESCRIPTION_TEMPLATE = """Effective {}, {} has obligated {} for “{}” to be performed in {}.
                            \n\n
                            This is a {} ({}) for {} under the NAICS Code {}.
                            \n\n                            
                            The current and ultimate completion dates are {} and {} respectively.
                            The total value of base and exercised options is {} out of {}.
                            \n\n
                            Inherently Governmental? {} {}
                            \n
                            Government-furnished? {}
                            \n
                            Multiyear? {}
                            \n
                            Service Contact Act? {}
                            \n\n
                            The vendor is {} in {}, {}.
                            \n
                            Revenue: {}
                            \n
                            Employees: {}
                            \n
                            Phone: {}
                            \n\n
                            For Profit? {}
                            \n
                            Status: {}
                            """

    def __init__(self,url):
        """Store the feed URL and the browser-like headers sent with the request.

        Args:
            url: Address of the feed to download.
        """
        # The requests module itself is used as the "session" object.
        self.session = requests
        # NOTE(review): 'br' is advertised below; requests presumably decodes
        # Brotli only when a brotli package is installed - confirm.
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Host': 'www.fpds.gov',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        self.url = url

    def make_requests(self):
        """Fetch ``self.url`` and return the response body as text.

        A copy of the body, with every ``ns1:`` rewritten to ``ns1`` and then
        lower-cased, is also written to ``sample.html``.

        Returns:
            The unmodified response text.
        """
        response = self.session.get(self.url,headers=self.headers,timeout=self.REQUEST_TIMEOUT)
        body = response.text
        with open('sample.html','w',newline='',encoding='utf-8') as export:
            export.write(body.replace('ns1:','ns1').lower())
        return body

    def amount_correction(self,source):
        """Render a numeric string as a whole-dollar amount prefixed with ``$``.

        The value is converted to float, truncated toward zero and formatted
        with babel's ``format_decimal`` for the ``en_US`` locale.

        Args:
            source: Text (or number) accepted by ``float()``.

        Returns:
            The formatted amount, e.g. presumably ``'$1,234'`` for ``'1234.56'``.

        Raises:
            ValueError: If ``source`` cannot be converted to float.
        """
        whole_dollars = math.trunc(float(source))
        return '$'+format_decimal(whole_dollars, locale='en_US')

    def open_existing(self):
        """Return the distinct, stripped lines of ``existing.csv``.

        A missing file is treated as "nothing published yet" and yields an
        empty list; ``parse_xml`` creates the file when it appends new keys.

        Returns:
            A list of unique line contents, in no particular order.
        """
        try:
            # The context manager closes the handle once the lines are read.
            with open('existing.csv') as handle:
                return list({line.strip() for line in handle})
        except FileNotFoundError:
            return []

    @staticmethod
    def _first(tree,path):
        """Return the stripped first match of XPath ``path`` in ``tree``.

        Raises:
            IndexError: If the expression matches nothing.
        """
        return tree.xpath(path)[0].strip()

    def _entry_key(self,tree):
        """Build the de-duplication key: PIID followed by the signed date's first token."""
        piid = self._first(tree,'//ns1piid//text()')
        signed_on = self._first(tree,'//ns1signeddate//text()').split(' ')[0].strip()
        return piid+signed_on

    def _build_item(self,tree):
        """Extract the fields of one parsed entry and return the feed ``Item``.

        Raises:
            IndexError: If any field other than the multi-year flag is absent.
            ValueError: If a monetary field is not numeric.
        """
        def first(path):
            return self._first(tree,path)

        def first_token(path):
            # Keep only the part before the first space of the matched text.
            return first(path).split(' ')[0]

        funding_office = first("//ns1fundingrequestingofficeid//@name")
        performance_city = first('//ns1placeofperformancezipcode//@city')
        vendor = first('//ns1vendorname//text()')
        obligated = self.amount_correction(first('//ns1obligatedamount//text()'))
        link = first('//link[@rel="alternate"]//@href')+self._LINK_SUFFIX
        effective_on = first_token('//ns1effectivedate//text()')
        requirement = first('//ns1descriptionofcontractrequirement//text()')
        contracting_agency = first('//ns1contractingofficeagencyid//@name')
        extent_competed = first('//ns1extentcompeted//@description')
        service_code = first('//ns1productorservicecode//@description')
        naics = first('//ns1principalnaicscode//@description')
        current_completion = first_token('//ns1currentcompletiondate//text()')
        ultimate_completion = first_token('//ns1ultimatecompletiondate//text()')
        exercised_value = self.amount_correction(first('//ns1totalbaseandexercisedoptionsvalue//text()'))
        all_options_value = self.amount_correction(first('//ns1totalbaseandalloptionsvalue//text()'))
        governmental = first('//ns1inherentlygovernmentalfunction//text()')
        governmental_note = first('//ns1inherentlygovernmentalfunction//@description')
        gfe_gfp = first('//ns1gfe-gfp//text()')
        try:
            multi_year = first('//ns1multiyearcontract//text()')
        except IndexError:
            # The multi-year flag is optional; it is rendered as "None".
            multi_year = None
        service_contract_act = first('//ns1servicecontractact//@description')
        vendor_city = first('//ns1vendorlocation//ns1city//text()')
        vendor_state = first('//ns1vendorlocation//ns1state//text()')
        revenue = self.amount_correction(first('//ns1annualrevenue//text()'))
        employees = first('//ns1numberofemployees//text()')
        phone = first('//ns1vendorlocation//ns1phoneno//text()')
        for_profit = first('//ns1isforprofitorganization//text()')
        size_status = first('//ns1contractingofficerbusinesssizedetermination//@description')

        title = "{} in {} – {} to {}".format(funding_office,performance_city,obligated,vendor)
        description = self._DESCRIPTION_TEMPLATE.format(
            effective_on,funding_office,obligated,requirement,performance_city,
            contracting_agency,extent_competed,service_code,naics,
            current_completion,ultimate_completion,exercised_value,all_options_value,
            governmental,governmental_note,gfe_gfp,multi_year,
            service_contract_act,vendor,vendor_city,vendor_state,revenue,employees,
            phone,for_profit,size_status)
        return Item(title = title,link = link,description = description)

    def parse_xml(self):
        """Publish every feed entry that is not yet recorded in ``existing.csv``.

        Writes ``fpds_rss_feed.xml`` only when at least one new item was
        found, and always opens ``existing.csv`` in append mode (creating it
        if needed) to record the new keys.

        Returns:
            None.
        """
        known_keys = set(self.open_existing())
        new_items = []
        new_keys = []
        source = self.make_requests()
        for section in re.findall('<entry>(.+?)</entry>',source,re.DOTALL):
            # Parse each entry once and reuse the tree for every lookup.
            tree = html.fromstring(section.replace('ns1:','ns1'))
            key = self._entry_key(tree)
            if key in known_keys:
                continue
            new_items.append(self._build_item(tree))
            new_keys.append(key)

        if new_items:
            feed = Feed(title = "RSS FEEDS",
                        link = "http://167.99.192.145/fpds_rss_feed.xml",
                        description = "Customized RSS Feed",
                        generator = "Shekhar Samanta",
                        items = new_items)
            with open("fpds_rss_feed.xml","w",encoding="utf-8") as export:
                export.write(feed.rss())
        with open('existing.csv','a',newline='') as export1:
            writer = csv.writer(export1)
            # Every collected key was already checked against known_keys.
            writer.writerows([key] for key in new_keys)

# Search-feed address handed to the scraper; kept as one literal so the encoded query stays intact.
FPDS_FEED_URL = 'https://www.fpds.gov/ezsearch/fpdsportal?s=FPDSNG.COM&indexName=awardfull&templateName=1.4.4&q=OBLIGATED_AMOUNT%3A%5B50000%2C%29+AND+PRINCIPAL_NAICS_CODE%3A%28541618+OR+541690+OR+541820+OR+541910+OR+541990+OR+561110+OR+561499+OR+561611+OR+561990+OR+921190+OR+922190+OR+923110+OR+923130+OR+928110+OR+928120%29+AND+AGENCY_CODE%3A%280559+OR+1100+OR+1145+OR+1153+OR+1204+OR+1301+OR+1544+OR+1549+OR+1550+OR+1900+OR+3400+OR+7003+OR+7009+OR+7022+OR+7100+OR+7200+OR+7505+OR+7523+OR+8000+OR+8300+OR+8900+OR+9543+OR+9577%29+PRODUCT_OR_SERVICE_CODE%3A%28+AJ96+OR+B506+OR+B507+OR+B522+OR+B544+OR+B548+OR+B549+OR+B550+OR+B551+OR+R405+OR+R406+OR+R407+OR+R408+OR+R409+OR+R412+OR+R419+OR+R422+OR+R423+OR+R426+OR+R499+OR+R699+OR+R706+OR+R707+OR+R708+OR+R799%29+AND+SIGNED_DATE%3A%5B2018%2F03%2F01%2C%29+AND+POP_STATE_NAME%3A%28%22VIRGINIA%22+OR+%22MARYLAND%22+OR+%22DISTRICT+OF+COLUMBIA%22%29+-%22DOMESTIC+AWARDEES+%28UNDISCLOSED%29%22+-%22FOREIGN+AWARDEES%22&rss=1&feed=atom0.3'
fpds = Scraper(FPDS_FEED_URL)
# Run the fetch/parse/publish pipeline and keep whatever parse_xml hands back.
source = fpds.parse_xml()

我尝试使用不同的 HTML 标签让 Chrome 能够识别换行,但没有效果。Description 标签中包含换行符 '\n',但它同样不起作用。

您需要在同一目录中放置一个空白的 CSV 文件 "existing.csv"。

0 个答案:

没有答案