根据位置将大文本文件拆分为小文本

时间:2017-02-24 13:20:55

标签: python jython-2.7

假设我有一个大文件file.txt,其中大约有300,000行数据。我想根据某个关键字段(位置)将它拆分。请参阅下面的file.txt:

Line 1: U0001;POUNDS;**CAN**;1234
Line 2: U0001;POUNDS;**USA**;1234
Line 3: U0001;POUNDS;**CAN**;1234
Line 100000: U0001;POUNDS;**CAN**;1234

这些位置仅限于10-15个不同的国家/地区。我需要把每个国家/地区的记录分别写入该国家/地区对应的文件中。如何在Python中完成此任务?

感谢您的帮助

4 个答案:

答案 0 :(得分:1)

这将以非常低的内存开销运行,因为它在读取时写入每一行。

算法:

  • 打开输入文件
  • 从输入文件中读取一行
  • 从该行中提取国家/地区
  • 如果是新的国家/地区,则为该国家/地区打开一个文件
  • 将该行写入国家/地区的文件
  • 循环,如果有更多行
  • 关闭文件

代码:

# Read file.txt line by line and append each record to "<country>.txt".
# An output file is opened the first time its country is seen and stays open
# until the end, so the number of open files equals the number of distinct
# countries in the input.
with open('file.txt', 'r') as infile:
    # Maps country code -> open output file handle.
    outfiles = {}
    try:
        for line in infile:
            # Blank lines (e.g. an empty line at the end of the file) have no
            # third field; skip them instead of failing with IndexError.
            if not line.strip():
                continue
            # The third ';'-separated field is the country; drop surrounding
            # '*' markers such as "**CAN**".
            country = line.split(';')[2].strip('*')
            if country not in outfiles:
                outfiles[country] = open(country + '.txt', 'w')
            outfiles[country].write(line)
    finally:
        # Close every output file, even if a line could not be processed.
        for outfile in outfiles.values():
            outfile.close()

答案 1 :(得分:0)

require './app/models/question_data'

# Specs for QuestionData: the `questions` reader on an instance and the
# class-level `.parse_csv`, `.load_questions` and `.new` entry points.
describe QuestionData do

  # Instance under test, built from the sample `data` below.
  subject(:question_data_instance) { described_class.new(data) }
  let(:question_data_class) { described_class }
  # NOTE(review): the examples below write `CSV` bare, which Ruby resolves as a
  # constant rather than a call to this helper, so this double looks unused —
  # confirm and remove or rename.
  let(:CSV) { double(:CSV, foreach: nil) }
  let(:questions) { [] }
  let(:file) { double(:file) }
  # Two sample question hashes passed to the constructor.
  let(:data) do
    [{
      time_limit: 10,
      text: "Who was the legendary Benedictine monk who invented champagne?",
      correct_answer: "Dom Perignon",
      option_2: "Ansgar",
      option_3: "Willibrord"
      },
      {
        time_limit: 12,
        text: "Name the largest freshwater lake in the world?",
        correct_answer: "Lake Superior",
        option_2: "Lake Victoria",
        option_3: "Lake Huron"
      }]
  end

  describe '#questions' do
    # The instance is built with `data`, and `questions` is expected to equal it.
    it "has an array of question data from CSV" do
      expect(question_data_instance.questions).to eq(data)
    end
  end

  describe '.parse_csv' do
    # Expects `foreach` to be called on `CSV` with the file double.
    # NOTE(review): presumably `CSV` here is the csv library constant loaded by
    # the model file — verify; the message expectation also replaces the real
    # `foreach`, so no file is read.
    it "parses CSV data into hash data" do
      expect(CSV).to receive(:foreach).with(file)
      question_data_class.parse_csv(file, questions)
    end
  end

  describe '.load_questions' do
    # Expects `.load_questions` to pass both arguments on to `.parse_csv`;
    # the expectation stands in for `.parse_csv`, so its real body is not run.
    it "calls '.parse_csv' method" do
      expect(question_data_class).to receive(:parse_csv).with(file, questions)
      question_data_class.load_questions(file, questions)
    end

    # NOTE(review): `.load_questions` is stubbed on the first line and then
    # called on the last line, so the real implementation does not run in this
    # example — check that the `.new` expectation exercises production code.
    it "creates an instance with CSV data" do
      allow(question_data_class).to receive(:load_questions).with(file, questions).and_return(question_data_instance)
      allow(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
      expect(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
      question_data_class.load_questions(file, questions)
    end
  end

  describe '.new' do
    # NOTE(review): the example sets an expectation on `.new` and then calls
    # `.new` itself, so it appears to verify only the mock — confirm intent.
    it "creates a new instance with CSV data" do
      expect(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
      question_data_class.new(data)
    end
  end

end

将按您的项目进行分组! 希望它有所帮助!

答案 2 :(得分:-1)

看起来你拥有的是csv文件。 csv代表以逗号分隔的值,但任何使用不同分隔符的文件(在本例中为分号;)都可以视为csv文件。

我们将使用python模块csv来读取文件,然后为每个国家/地区写一个文件

import csv 
from collections import defaultdict

# Group the rows of file.txt by their third ';'-separated column, then write
# each group to "<country>.txt" using the same delimiter.
# Files are opened in binary mode as the Python 2 csv module expects
# (the question is tagged jython-2.7); on Python 3 use text mode with
# newline='' instead.
d = defaultdict(list)
with open('file.txt', 'rb') as f:
    r = csv.reader(f, delimiter=';')
    for line in r:
        # csv.reader yields an empty list for a blank line; it has no
        # country column, so skip it.
        if not line:
            continue
        # Index with the loop variable `line` (the original referenced an
        # undefined name `l`, which raised NameError).
        d[line[2]].append(line)

for country in d:
    with open('{}.txt'.format(country), 'wb') as outfile:
        w = csv.writer(outfile, delimiter=';')
        for line in d[country]:
            w.writerow(line)

答案 3 :(得分:-1)

# Split file.txt into one output file per location, where the location is the
# third ';'-separated field of each line (lower-cased).

# the formatting-function for the filename used for saving
outputFileName = "{}.txt".format
# alternative:
##import time
##outputFileName = lambda loc: "{}_{}.txt".format(loc, time.asctime())

# Collect the lines of each location in a list; joining once at the end avoids
# re-copying the accumulated text every time a line is appended.
linesByLocation = {}
# `with` closes the input file even if a line cannot be parsed
with open("file.txt", "r") as f:
    # iterate the file object directly instead of calling readlines()
    for l in f:
        # skip blank lines: they have no location field
        if not l.strip():
            continue
        # the third field (indices begin with 0) is the location abbreviation;
        # it is lower-cased because on case-insensitive filesystems names that
        # differ only in case refer to the same file, while Python treats such
        # dictionary keys as distinct
        location = l.split(';')[2].lower().strip()
        linesByLocation.setdefault(location, []).append(l.strip() + '\n')

# dictionary indexed by location; each value is the complete content of the
# output file for that location
sortedByLocation = {}
for location, lines in linesByLocation.items():
    sortedByLocation[location] = ''.join(lines)

# save one file per location
for location, text in sortedByLocation.items():
    # "w" is required here: open() defaults to read mode
    with open(outputFileName(location), "w") as f:
        f.write(text)