假设我有一个大文件file.txt,它有大约300,000的数据。我想根据某个关键位置拆分它。请参阅下面的file.txt:
Line 1: U0001;POUNDS;**CAN**;1234
Line 2: U0001;POUNDS;**USA**;1234
Line 3: U0001;POUNDS;**CAN**;1234
Line 100000; U0001;POUNDS;**CAN**;1234
这些地点仅限于10-15个不同的国家。我需要在一个特定文件中分隔特定国家/地区的每个记录。如何在Python中执行此任务
感谢您的帮助
答案 0 :(得分:1)
这将以非常低的内存开销运行,因为它在读取时写入每一行。
算法:
代码:
with open('file.txt', 'r') as infile:
try:
outfiles = {}
for line in infile:
country = line.split(';')[2].strip('*')
if country not in outfiles:
outfiles[country] = open(country + '.txt', 'w')
outfiles[country].write(line)
finally:
for outfile in outfiles.values():
outfile.close()
答案 1 :(得分:0)
require './app/models/question_data'
describe QuestionData do
subject(:question_data_instance) { described_class.new(data) }
let(:question_data_class) { described_class }
let(:CSV) { double(:CSV, foreach: nil) }
let(:questions) { [] }
let(:file) { double(:file) }
let(:data) do
[{
time_limit: 10,
text: "Who was the legendary Benedictine monk who invented champagne?",
correct_answer: "Dom Perignon",
option_2: "Ansgar",
option_3: "Willibrord"
},
{
time_limit: 12,
text: "Name the largest freshwater lake in the world?",
correct_answer: "Lake Superior",
option_2: "Lake Victoria",
option_3: "Lake Huron"
}]
end
describe '#questions' do
it "has an array of question data from CSV" do
expect(question_data_instance.questions).to eq(data)
end
end
describe '.parse_csv' do
it "parses CSV data into hash data" do
expect(CSV).to receive(:foreach).with(file)
question_data_class.parse_csv(file, questions)
end
end
describe '.load_questions' do
it "calls '.parse_csv' method" do
expect(question_data_class).to receive(:parse_csv).with(file, questions)
question_data_class.load_questions(file, questions)
end
it "creates an instance with CSV data" do
allow(question_data_class).to receive(:load_questions).with(file, questions).and_return(question_data_instance)
allow(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
expect(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
question_data_class.load_questions(file, questions)
end
end
describe '.new' do
it "creates a new instance with CSV data" do
expect(question_data_class).to receive(:new).with(data).and_return(question_data_instance)
question_data_class.new(data)
end
end
end
将按您的项目进行分组! 希望它有所帮助!
答案 2 :(得分:-1)
看起来你拥有的是csv
文件。 csv
代表以逗号分隔的值,但任何使用不同分隔符的文件(在本例中为分号;
)都可以视为csv
文件。
我们将使用python模块csv
来读取文件,然后为每个国家/地区写一个文件
import csv
from collections import defaultdict
d = defaultdict(list)
with open('file.txt', 'rb') as f:
r = csv.reader(f, delimiter=';')
for line in r:
d[l[2]].append(l)
for country in d:
with open('{}.txt'.format(country), 'wb') as outfile:
w = csv.writer(outfile, delimiter=';')
for line in d[country]:
w.writerow(line)
答案 3 :(得分:-1)
# the formatting-function for the filename used for saving
outputFileName = "{}.txt".format
# alternative:
##import time
##outputFileName = lambda loc: "{}_{}.txt".format(loc, time.asciitime())
#make a dictionary indexed by location, the contained item is new content of the file for the location
sortedByLocation = {}
f = open("file.txt", "r")
#iterate each line and look at the column for the location
for l in f.readlines():
line = l.split(';')
#the third field (indices begin with 0) is the location-abbreviation
# make the string lower, cause on some filesystems the file with upper chars gets overwritten with only the elements with lower characters, while python differs between the upper and lower
location = line[2].lower().strip()
#get previous lines of the location and store it back
tmp = sortedByLocation.get(location, "")
sortedByLocation[location]=tmp+l.strip()+'\n'
f.close()
#save file for each location
for location, text in sortedByLocation.items():
with open(outputFileName(location) as f:
f.write(text)