从Python(2.7)生成的CSV文件中删除所有Unicode前缀 u'

时间:2017-01-08 03:26:38

标签: python python-2.7 csv unicode

    import os
    import json
    import csv
    import re


    # Directory that jsontocsv() lists to find the JSON files it converts.
    subdir =  "./json_files/" #'/home/varun/Desktop/pyfile'

    def jsontocsv():
        """Merge every JSON file found in ``subdir`` into ``test.csv``.

        One row is written per input file.  Cells are placed by column name,
        so each value lands under its own header regardless of the key order
        inside the JSON object.  Keys missing from a file produce an empty
        cell and keys that are not listed in ``fieldnames`` are dropped.
        Nested objects and arrays are stored as JSON text, which keeps
        Python 2's ``u'...'`` reprs out of the file.

        Raises whatever ``open`` or ``json.load`` raise for an unreadable or
        malformed input file.
        """
        fieldnames = ['name', 'private', 'version', 'dependencies', 'scripts', 'devDependencies']

        # The output is opened once; re-opening it in append mode for every
        # input file is unnecessary.
        with open('test.csv', 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames,
                                    restval='', extrasaction='ignore')
            writer.writeheader()

            for file_name in os.listdir(subdir):
                file_path = os.path.join(subdir, file_name)
                if not os.path.isfile(file_path):
                    # A sub-directory cannot be opened as a JSON document.
                    continue

                with open(file_path, 'r') as json_file:
                    parsed_json = json.load(json_file)

                row = {}
                for key, value in parsed_json.items():
                    if isinstance(value, (dict, list)):
                        # sort_keys keeps the cell text stable between runs.
                        value = json.dumps(value, sort_keys=True)
                    row[key] = value
                writer.writerow(row)


    def cleanUnicode():
        """Copy ``test.csv`` to ``data.csv`` without Python 2 ``u'...'`` prefixes.

        Every data row of ``test.csv`` is read by column name.  Cells that
        hold the repr of a dict or list (they start with ``{`` or ``[``) have
        their unicode-literal prefixes removed; all other cells are copied
        unchanged.  Columns absent from ``test.csv`` are written as empty
        cells.

        Raises whatever ``open`` raises when ``test.csv`` is missing.
        """
        fieldnames = ['name', 'private', 'version', 'dependencies', 'scripts', 'devDependencies']

        # A literal prefix is a "u" that opens a token (start of the cell, or
        # right after whitespace or one of "{[(,:") and is directly followed
        # by a quote.  Letters inside values such as 'build' never match.
        unicode_prefix = re.compile(r"""(?<![^\s{\[(,:])u(?=['"])""")

        with open('test.csv', 'r') as csvfile:
            rows = list(csv.DictReader(csvfile, delimiter=','))

        with open('data.csv', 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            # DictReader has already consumed the header line, so every entry
            # of rows is data and none of them may be skipped.
            for row in rows:
                cleaned = {}
                for column in fieldnames:
                    cell = row.get(column) or ''
                    if cell[:1] in ('{', '['):
                        cell = unicode_prefix.sub('', cell)
                    cleaned[column] = cell
                writer.writerow(cleaned)

        # test.csv is intentionally left in place; call os.remove('test.csv')
        # here if the intermediate file should not be kept.


    def main():
        """Build test.csv from the JSON files, clean it, then report completion."""
        jsontocsv()
        cleanUnicode()
        print("Scripts finished running all json files parsed to csv")


    # Only run the conversion when executed as a script, not when imported.
    if __name__ == '__main__':
        main()

我正在读取多个JSON文件,并把其中的数据写入同一个CSV文件,但每个嵌套值前面都带有“u”前缀。如何去掉这些前缀,只保留我想要的数据?

示例输入:

{
      "version": "0.1.0",
      "devDependencies": {
        "react-scripts": "0.6.1"
      },
      "dependencies": {
        "crossfilter": "^1.3.12",
        "d3": "^4.2.6",
        "d3-scale": "^1.0.3",
        "dc": "^2.0.0-beta.32",
        "immutable": "^3.8.1",
        "jszip": "^3.1.2",
        "react": "^15.3.2",
        "react-addons-transition-group": "^15.3.2",
        "react-dom": "^15.3.2",
        "shifty": "^1.5.2",
        "wolfy87-eventemitter": "^5.1.0"
      },
      "scripts": {
        "start": "react-scripts start",
        "build": "react-scripts build",
        "test": "react-scripts test --env=jsdom",
        "eject": "react-scripts eject"
      }
    }

输出:

version,dependencies,scripts,devDependencies
0.1.0,"{u'wolfy87-eventemitter': u'^5.1.0', u'shifty': u'^1.5.2', u'react-addons-transition-group': u'^15.3.2', u'react-dom': u'^15.3.2', u'dc': u'^2.0.0-beta.32', u'ccbooleananalysis': u'^1.0.0', u'react': u'^15.3.2', u'jszip': u'^3.1.2', u'crossfilter': u'^1.3.12', u'ccnetviz': u'^1.0.8', u'immutable': u'^3.8.1', u'd3': u'^4.2.6', u'd3-scale': u'^1.0.3'}","{u'test': u'react-scripts test --env=jsdom', u'start': u'react-scripts start', u'build': u'react-scripts build', u'eject': u'react-scripts eject'}",{u'react-scripts': u'0.6.1'}

希望把所有的 u' 前缀都去掉。

1 个答案:

答案 0 :(得分:0)

我不确定您为什么要将字典作为字符串写入CSV文件,但无论如何......

这是获取没有u Unicode前缀的字符串的一种方法。我们处理通过加载JSON数据创建的字典,将所有键和值字符串编码为UTF-8;任何字典值都是递归处理的。

这适用于纯ASCII数据。但是,任何超出7位ASCII范围的数据都将编码为\x转义序列。但这不是一个真正的问题。当您阅读CSV文件时,您可能希望将这些字符串转换回正确的字典。您可以使用ast.literal_eval,它会很乐意接受\x转义序列。

要验证此代码是否能正确处理Unicode,我在测试数据中添加了一个额外的项目。“devDependencies”字典现在包含一个新项目:“unicode-test”,其值为“™©”。在代码的最后一部分,我读回CSV数据,将“devDependencies”字符串转换回dict,并打印该dict的'unicode-test'字段,以验证它是否被转换回了正确的Unicode字符串。

顺便说一句,我这些天大多使用Python 3.6,而我手头最新的Python 2版本是2.6.6。它的csv模块没有DictWriter.writeheader方法,因此我使用另一种方法来编写标题行。

import json
import csv
import ast

# Codec applied by encode_dict() to every key and string value.
encoding = "utf8"

# CSV file that the script writes and then reads back.
csvname = "test.csv"

# Sample package.json document; "unicode-test" holds non-ASCII text so the
# round trip through the CSV file can be checked.
src = '''\
{
    "version": "0.1.0",
    "devDependencies": {
        "unicode-test": "™©",
        "react-scripts": "0.6.1"
    },
    "dependencies": {
        "crossfilter": "^1.3.12",
        "d3": "^4.2.6",
        "d3-scale": "^1.0.3",
        "dc": "^2.0.0-beta.32",
        "immutable": "^3.8.1",
        "jszip": "^3.1.2",
        "react": "^15.3.2",
        "react-addons-transition-group": "^15.3.2",
        "react-dom": "^15.3.2",
        "shifty": "^1.5.2",
        "wolfy87-eventemitter": "^5.1.0"
    },
    "scripts": {
        "start": "react-scripts start",
        "build": "react-scripts build",
        "test": "react-scripts test --env=jsdom",
        "eject": "react-scripts eject"
    }
}
'''

# Parsed form of the sample document.
data = json.loads(src)

def encode_dict(d):
    """Return a copy of ``d`` with its text keys and values encoded to bytes.

    Text is encoded with the module-level ``encoding``.  Nested dicts and
    lists are processed recursively.  Leaves that are not text (booleans,
    numbers, ``None``) are copied unchanged instead of raising
    ``AttributeError``, so documents with fields such as ``"private": true``
    can be processed as well.
    """
    # The text type json.loads produces: ``unicode`` on Python 2.
    text_type = type(u'')

    def encode_value(value):
        # Encode one value, descending into containers.
        if isinstance(value, dict):
            return encode_dict(value)
        if isinstance(value, list):
            return [encode_value(item) for item in value]
        if isinstance(value, text_type):
            return value.encode(encoding)
        return value

    newd = {}
    for k, v in d.items():
        if isinstance(k, text_type):
            k = k.encode(encoding)
        newd[k] = encode_value(v)
    return newd

# Encode every string of the parsed document so that its repr carries no u''
# prefix.  A parenthesised single-argument print behaves exactly like the
# Python 2 print statement.
clean_data = encode_dict(data)
print(clean_data)
print('- ' * 20)

fieldnames = ['version', 'dependencies', 'scripts', 'devDependencies']
# Maps each column name to itself, so it can be written as the header row.
header_row = dict(zip(fieldnames, fieldnames))

with open(csvname, 'wb') as csv_out:
    writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    # writer.writeheader() is deliberately avoided: the header is written as
    # an ordinary row, the old-fashioned way.
    writer.writerow(header_row)
    writer.writerow(clean_data)

# Verify: read the file back and rebuild the devDependencies dict from its
# textual form.
with open(csvname, 'rb') as csv_in:
    for record in csv.DictReader(csv_in):
        print(record)
        dev_deps = ast.literal_eval(record['devDependencies'])
        print(dev_deps['unicode-test'])

输出

{'devDependencies': {'unicode-test': '\xe2\x84\xa2\xc2\xa9', 'react-scripts': '0.6.1'}, 'version': '0.1.0', 'dependencies': {'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}, 'scripts': {'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}}
- - - - - - - - - - - - - - - - - - - - 
{'devDependencies': "{'unicode-test': '\\xe2\\x84\\xa2\\xc2\\xa9', 'react-scripts': '0.6.1'}", 'version': '0.1.0', 'dependencies': "{'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}", 'scripts': "{'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}"}
™©

test.csv的内容

version,dependencies,scripts,devDependencies
0.1.0,"{'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}","{'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}","{'unicode-test': '\xe2\x84\xa2\xc2\xa9', 'react-scripts': '0.6.1'}"