使用多重处理进行文件处理-python

时间:2019-03-08 22:04:15

标签: python

我是Python的初学者,尝试添加几行代码以将json转换为csv并返回到json。有成千上万个文件(大小为300 MB)要转换和处理。使用当前程序(使用1个CPU),我无法使用服务器的16个CPU,并且需要建议以微调程序以进行多处理。下面是我的python 3.7版本的代码。

import json
import csv
import os


def convert_file(json_path):
    """Convert one JSON-lines file to a CSV file with the same base name.

    The keys of the first successfully parsed line supply the header row;
    every parsed line contributes one row of values.  Unparseable lines
    are reported and skipped (the original code reused the previous
    line's stale ``data`` after a failed parse, and crashed with
    NameError if the very first line failed).

    Returns the path of the CSV file written.
    """
    partial_name = json_path.split('.')[0]
    csv_path = partial_name + ".csv"
    with open(json_path) as source, \
            open(csv_path, 'w', newline='') as target:
        writer = csv.writer(target)
        header_written = False
        for line_number, line in enumerate(source, start=1):
            try:
                # parse_float=str keeps float text identical to the input
                data = json.loads(line, parse_float=str)
            except json.JSONDecodeError:
                # Skip the bad line instead of writing stale data
                print("Can't load line {}".format(line_number))
                continue
            if not header_written:
                writer.writerow(data.keys())  # header row from first record
                header_written = True
            writer.writerow(data.values())
    return csv_path


if __name__ == "__main__":
    os.chdir('/stagingData/Scripts/test')
    for json_file in os.listdir(os.getcwd()):
        convert_file(json_file)

了解有关多处理逻辑的建议

2 个答案:

答案 0 :(得分:1)

由于文件很多,因此文档中最简单的多处理示例应该适用。 https://docs.python.org/3.4/library/multiprocessing.html?highlight=process

// NOTE(review): this React Native / react-navigation snippet appears
// unrelated to the surrounding Python multiprocessing question — it looks
// like content misplaced from a different answer during extraction.
// Verify against the original post before relying on it.
import XYZ from './xyz';
import {Edit} from './xyz';
import { pencilEditButton } from './Images';

// Stack navigator with a single Home screen whose header shows a
// pencil/edit button on the right that triggers EditMix().
const App = createStackNavigator(
{
    Home: {
        screen: My App, // NOTE(review): `My App` is not a valid identifier — presumably a placeholder for a component

        navigationOptions: ({ navigation }) => ({
            title: 'myApp',

            headerRight: (
            <View>
                <TouchableHighlight
                    onPress={() => EditMix()}
                    underlayColor="gray">
                    <View>
                        <Image source={pencilEditButton} style={styles.navigationButtonImage} />
                    </View>
                </TouchableHighlight>
            </View>
            ),
        }),
    },
}
);

export default createAppContainer(App);

您还可以尝试将 os.listdir() 替换为 os.scandir(),后者返回一个惰性迭代器,在开始处理前不必一次性取回所有目录条目。

答案 1 :(得分:1)

如果您要更有效地处理一个大文件,我建议以下内容:

  1. 将文件拆分为大块

  2. 创建一个处理每个块的过程

  3. (如有必要)将处理后的块合并回一个文件

类似这样的东西:

import csv
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

source_big_file = Path('/path/to/file')

def chunk_file_by_line(source_filepath: Path, chunk_size: int = 10_000):
    """Split *source_filepath* into chunk files of *chunk_size* lines each.

    Chunk files are written next to the source, named
    ``<stem>.g<group><suffix>`` where ``group`` is the first line number
    of the chunk.  Each chunk's path is yielded as soon as the chunk is
    complete; the final (possibly partial) chunk is yielded after the
    source is exhausted.  Yields nothing for an empty file.
    """
    intermediate_file_handlers = {}
    last_chunk_filepath = None
    # Use the parameter, not the module-level global, so the function is reusable.
    with source_filepath.open('r', encoding='utf8') as big:
        # enumerate() is required: iterating a file yields lines, not
        # (number, line) pairs as the original code assumed.
        for line_number, line in enumerate(big):
            group = line_number - (line_number % chunk_size)
            chunk_filename = f'{source_filepath.stem}.g{group}{source_filepath.suffix}'
            chunk_filepath = source_filepath.parent / chunk_filename
            if chunk_filepath not in intermediate_file_handlers:
                file_handler = chunk_filepath.open('w', encoding='utf8')
                intermediate_file_handlers[chunk_filepath] = file_handler
                if last_chunk_filepath:
                    # Previous chunk is complete: close it and hand it out.
                    last_file_handler = intermediate_file_handlers.pop(last_chunk_filepath)
                    last_file_handler.close()
                    yield last_chunk_filepath
            else:
                file_handler = intermediate_file_handlers[chunk_filepath]
            file_handler.write(line)
            last_chunk_filepath = chunk_filepath
    # Close and output the last chunk (the original leaked this handle).
    if last_chunk_filepath is not None:
        intermediate_file_handlers[last_chunk_filepath].close()
        yield last_chunk_filepath


def json_to_csv(json_filepath: Path) -> Path:
    """Convert a JSON-lines file to a CSV file next to it.

    The keys of the first record fix the column order; subsequent records
    must use the same keys (csv.DictWriter raises ValueError on extras).
    Returns the path of the CSV file written.
    """
    csv_filename = f'{json_filepath.stem}.csv'
    csv_filepath = json_filepath.parent / csv_filename
    # newline='' is required for csv on Windows to avoid blank rows.
    with csv_filepath.open('w', encoding='utf8', newline='') as csv_out, \
            json_filepath.open('r', encoding='utf8') as json_in:
        dwriter = None  # created lazily: DictWriter needs fieldnames up front
        for json_line in json_in:
            data = json.loads(json_line)
            if dwriter is None:
                # The original called csv.DictWriter(csv_out) with no
                # fieldnames (TypeError) and a nonexistent .writeline().
                dwriter = csv.DictWriter(csv_out, fieldnames=list(data.keys()))
                dwriter.writeheader()
            dwriter.writerow(data)
    return csv_filepath


def _convert_chunks_in_parallel(source_path):
    """Chunk *source_path* and convert each chunk to CSV in a process pool."""
    with ProcessPoolExecutor() as pool:
        futures = []
        # The original called `chuck_file_by_line` — a typo (NameError);
        # the generator is named chunk_file_by_line.
        for chunk_filepath in chunk_file_by_line(source_path):
            future = pool.submit(json_to_csv, chunk_filepath)
            futures.append(future)

        # Wait for all conversions to finish, in submission order.
        for future in futures:
            csv_filepath = future.result(timeout=None)  # waits until complete
            print(f'conversion complete> csv filepath: {csv_filepath}')


# Guard the driver so importing this module has no side effects.
if __name__ == '__main__':
    _convert_chunks_in_parallel(source_big_file)