How to parse multiple tables with BeautifulSoup and save them to a csv file

Time: 2017-05-04 08:24:55

Tags: python html csv beautifulsoup

I have a program that is almost finished; it is only missing the last part, which I am struggling with. From a lot of web pages I need to scrape the tables inside the contentHolder div and write them to a csv file (if you want to see an example, go to http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA, fill in 03732 as the case number and 16 as the case year, and click the first Submit), producing something like:

Case Status, Status Available, Case Number, PA/03732/16, Location of development: 40....

and so on for all the tables on the page, and for a large number of pages. I wrote some code that tries to do this, but it does not work: when I run it, it produces this output in the csv file: https://gyazo.com/6557ac08ad5613a24b5432bfd9e4f2e6 and it does not even finish all the pages, because it raises an error partway through:

Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 93, in <module>
    csv.writer(f).writerow(answer)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u201c' in position 0: ordinal not in range(128)
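
As far as I can tell, the error comes from Python 2's csv module, which only handles byte strings and implicitly encodes any unicode field with the ascii codec. A minimal reproduction of what I believe is going on (u'\u201c' is the curly quote from the traceback; demo.csv is just a scratch file):

import csv

# Python 2: csv.writer converts each field to a byte string, which
# ascii-encodes unicode values and fails on the non-ASCII curly quote.
with open('demo.csv', 'w') as f:
    csv.writer(f).writerow([u'\u201c'])  # raises the same UnicodeEncodeError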

Here is the full code of my program so far:

import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    # Strip spaces out of the PDF filenames so the later shell command doesn't break
    file_list = os.listdir(r"C:\PROJECT\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\PROJECT\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)
# Change this to your pdf2txt.py script location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)  # To remove the names that appear, remove the last (\S*); to add them back, re-add it (before it there was a \s*)
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]

                browser = RoboBrowser()
                browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                form = browser.get_forms()[0]  # Get the first form on the page
                form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year

                browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])

                # Use BeautifulSoup to parse this data
                answer = browser.response.text
                print(answer)
                soup = BeautifulSoup(answer)
                #print soup.prettify()
                status = soup.select('#Table1')
                print (status)
                with codecs.open('file_output.csv', 'a', encoding='utf-8') as f:
                    for tag in soup.select("#Table1"):
                        csv.writer(f).writerow(answer)

EDIT: I tried changing the last line to csv.writer(f).writerow(answer.encode("utf-8")) but it did not work; it printed a different error message:

Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 93, in <module>
    csv.writer(f).writerow(answer.encode("utf-8"))
  File "C:\Python27\lib\codecs.py", line 706, in write
    return self.writer.write(data)
  File "C:\Python27\lib\codecs.py", line 369, in write
    data, consumed = self.encode(object, self.errors)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 25496: ordinal not in range(128)

And the final csv file ends up with no content at all.
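
If it helps to see it in isolation, I believe this second error comes from the codecs.open(..., encoding='utf-8') wrapper: its writer expects unicode, so when it receives an already-encoded byte string it first tries to decode it as ascii before re-encoding, which should reproduce like this (demo.csv is a scratch file):

import codecs

# Python 2: a codecs utf-8 StreamWriter ascii-decodes byte strings
# before re-encoding them, so UTF-8 bytes like 0xe2 blow up.
with codecs.open('demo.csv', 'a', encoding='utf-8') as f:
    f.write(u'\u201c'.encode('utf-8'))  # raises the same UnicodeDecodeError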

1 answer:

Answer 0 (score: 0)

You need to encode the output as UTF-8. Change the last line to:

csv.writer(f, encoding="utf-8").writerow(answer.encode("utf-8"))

Also change the import from import csv to import unicodecsv as csv.
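
A minimal sketch of the fixed writing step, assuming unicodecsv is installed (pip install unicodecsv) and the output file is opened in binary mode, which is what unicodecsv expects; the field values here are just hypothetical examples:

import unicodecsv as csv

# unicodecsv encodes unicode fields itself, so pass unicode in and
# let the writer produce UTF-8 bytes on a binary file handle.
with open('file_output.csv', 'ab') as f:
    writer = csv.writer(f, encoding='utf-8')
    writer.writerow([u'Case Status', u'PA/03732/16'])

With unicodecsv there is no need for codecs.open; a plain binary file handle is enough.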