Question

这就是我正在做的事情

import csv
# Destination file, opened up front in binary-write mode; the script closes it
# once every record has been written.
output = open('output.txt', 'wb')


# Return the minimum length stored in "<num>.txt".
def get_min(num):
    """Return the integer found on the first line of ``<num>.txt``.

    Args:
        num: Base name (without the ``.txt`` extension) of the file to read,
            relative to the current working directory.

    Returns:
        The first line of the file converted with ``int()``.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If the first line is empty or is not a valid integer.
    """
    # Read-only access is enough, and the context manager closes the handle
    # instead of leaving it to the garbage collector.
    with open('%s.txt' % num, 'r') as num_file:
        # Only the first line matters, so do not load the whole file.
        return int(num_file.readline())

# Read input.txt once and close it right away (read-only access is enough).
with open('input.txt', 'r') as input_file:
    input_lines = input_file.readlines()

# Pair each header line (even index) with the data line that follows it.
# A trailing header without a data line is dropped, as zip stops at the
# shorter slice.
input_list = list(zip(input_lines[0::2], input_lines[1::2]))

# Cut every data line down to the length returned by get_min() for the
# character that sits just before the header's line break.
# The data line's own newline is stripped before slicing so that a sequence
# shorter than the cut length does not end up with two line breaks.
filtered = [
    (header, data.rstrip('\n')[:get_min(header[-2])] + '\n')
    for header, data in input_list
]

# Emit header + trimmed data for every record, then release the output file.
for record in filtered:
    output.write(''.join(record))
output.close()

在此代码中input.txt就是这样的

>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA

和num.txt就是这样的

M 4
P 10

我希望对 input.txt 中的每条记录，取其标题行最后一列的值，在 num.txt 中查找与之相同的条目，并根据该条目对应的数值截取这条记录的字符。

我认为我的代码的问题在于它只能处理内容为纯整数的文本文件，而它还应该能处理包含字母的文件（例如上面的 num.txt）。

Answer 1

你可以这样做;

import re

# Number of matches to let through; removal starts after this many.
min_count = 4

# Text to look for -- the name of the file that was read.
str_to_match = 'EOG6CC67M'

# Placeholder for the contents of input.txt.
input = ''

# Running tally of matches seen so far.
counter = 0

def callback_f(e):
    """Replacement callback for ``re.sub`` that drops matches past a limit.

    Increments the module-level ``counter`` for every match, prints the
    running count together with the matched text, and decides what the match
    is replaced with.

    Args:
        e: The match object handed over by ``re.sub``.

    Returns:
        The matched text unchanged for the first ``min_count`` matches, and
        an empty string (i.e. the match is removed) for every later one.
    """
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())

    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''  # replace with nothing

    # Keep the match as it is. Falling through without a return would hand
    # None back to re.sub, which CPython treats like an empty replacement,
    # so every match would be removed regardless of the count.
    return e.group()

# str_to_match holds a literal file name, so escape it before using it as a
# pattern; otherwise characters such as '.' or '+' would act as regex syntax.
# callback_f decides what each match is replaced with.
result = re.sub(re.escape(str_to_match), callback_f, input)

通过这种策略，您可以使用全局计数器进行计数，而不需要编写结构复杂的硬编码循环。

**更新**

更详细的文件访问版本;

import os
import re

def callback_f(e):
    """Count one regex match and echo it to stdout.

    Bumps the module-level ``counter`` and prints it next to the matched
    text.

    Args:
        e: The match object passed in by ``re.sub``.

    Returns:
        None -- there is no explicit return value.
        NOTE(review): CPython's ``re.sub`` appears to treat a ``None`` result
        as an empty replacement, so each match would be removed -- confirm
        that this is intended.
    """
    global counter
    counter = counter + 1
    # Check your input
    print(' >>> '.join((str(counter), e.group())))


# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}

for num_name in num_files:
    # Entries whose name starts with a dot are skipped.
    if num_name.startswith('.'):
        continue

    # The context manager closes each file as soon as it has been read.
    with open(os.path.join('./num_files', num_name)) as num_file:
        # Key: file name up to the first dot. Value: the raw file text,
        # stored unconverted (it is still a string, not an int).
        numbers[num_name.split('.')[0]] = num_file.read()


# Now the CSV files
csv_files = os.listdir('./csv_files')

for csv_name in csv_files:
    # Entries whose name starts with a dot are skipped.
    if csv_name.startswith('.'):
        continue

    csv_path = os.path.join('./csv_files', csv_name)

    # items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
    # NOTE(review): min_count is bound here, but this loop never passes it on
    # to callback_f -- confirm how the callback is meant to see it.
    for hash_name, min_count in numbers.items():
        # Re-read the file for every hash so that content written back in a
        # previous iteration is picked up; the handle is closed right away.
        with open(csv_path) as csv_file:
            file_c = csv_file.read()

        # Reset the module-level counter that callback_f increments.
        counter = 0

        # hash_name comes from a file name, so match it literally.
        result = re.sub(re.escape(hash_name), callback_f, file_c)

        # Write the replaced content (result) back to csv_path here

考虑目录/文件结构;

 + Projects
   + Project_folder
     + csv_files
       - input1.csv
       - input2.csv
       ~ etc.
     + num_files
       - EOG6CC67M.txt
       - EOG62JQZP.txt
       ~ etc.
     - python_file.py

CSV文件包含您在原始问题中陈述的大块文本。
Num文件是以哈希值命名的文件，每个文件中包含一个整数（Integer）。

此脚本会发生什么;

收集所有哈希文件（在字典中）及其内部计数
循环浏览所有CSV文件
对每个CSV文件，逐一遍历收集到的哈希值及其对应的数字
替换/删除（基于您在callback_f()中执行的操作）某个计数后的哈希值
写回输出（这是脚本中的最后一条注释，包含file.write()功能）

Answer 2

经过与OP的长时间聊天后完全修订的版本;

import os
import re

# Fetch all hashes and counts
# The context manager closes num.txt as soon as its text has been read.
with open('num.txt') as num_file:
    file_c = num_file.read()

# Every entry of interest has the form "<hash>.txt <count>".
lines = re.findall(r'\w+\.txt \d+', file_c)
numbers = {}

for line in lines:
    # Split "<hash>.txt <count>" into its two parts; the count is kept as a
    # string and converted where it is used.
    hash_name, count = line.split('.txt ')
    numbers[hash_name] = count

#print(numbers)

# The input file
# The context manager closes input.txt as soon as its text has been read.
with open('input.txt') as input_file:
    file_i = input_file.read()

# items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
for hash_name, count in numbers.items():
    regex = '(' + hash_name.strip() + ')'
    # NOTE(review): the pattern needs a '>' after the captured text, so a
    # record that is not followed by another '>' cannot match -- confirm
    # that the last record in the file is handled as intended.
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)

    if result:
        # Only the first match is used; index 2 is the text captured by the
        # trailing (.*?) group.
        # NOTE(review): that text starts right after the hash, so it
        # presumably begins with the header's line break, which then counts
        # towards `count` -- confirm this is intended.
        data_original = result[0][2]
        stripped_data = data_original[int(count):]

        # Replaces every occurrence of the captured text in the input.
        file_i = file_i.replace(data_original, '\n' + stripped_data)

        #print(data_original)
        #print(stripped_data)

#print(file_i)


# Write the input file to new input_new.txt
# The context manager flushes and closes the file once the text is written.
with open('input_new.txt', 'wt') as f:
    f.write(file_i)

根据文件中的值切割字符值

2 个答案: