我正在尝试读取大型数据文件。我在 C# 中有以下代码,它可以获取满足要求的行并将它们保存到单独的文件中。整个代码大约需要 20 分钟才能运行。
*miniArray
我用 Python 写了一段代码,可以使用 chunksize 实现相同的功能,但运行需要 25 个小时。我想知道是否有一种方法可以根据上面的 C# 代码在 Python 中编写类似的代码。
我的 Python 代码如下。将行转换为 DataFrame 的命令占用的时间最多。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace ConsoleApplication1 {
/// <summary>
/// Splits a comma-separated text file into one output file per distinct integer value
/// found in a chosen column ("feature"). All work is done by the constructor: it reads
/// the header, streams the input once, writes each accepted row to
/// "file&lt;value&gt;.csv" in the output directory, and closes every writer before returning.
/// </summary>
/// <remarks>
/// Rows are split on ',' only; quoted fields containing commas are not handled.
/// </remarks>
public class CSVFileConverter {
    // Inclusive range of feature values that are accepted; anything else is reported and skipped.
    private const int MinFeatureValue = 0;
    private const int MaxFeatureValue = 100;

    // A progress message is printed whenever the accepted row's line index is a multiple of this.
    private const int ProgressInterval = 10000;

    /// <summary>Path of the input file.</summary>
    public string FilePath { get; set; }

    /// <summary>Directory that receives the generated "file&lt;value&gt;.csv" files.</summary>
    public string OutputDirectory { get; set; }

    /// <summary>Zero-based index of the first input line treated as data (earlier lines are skipped).</summary>
    public int numStartLine { get; set; }

    /// <summary>Name of the header column whose integer value selects the output file.</summary>
    public string strFeature { get; set; }

    /// <summary>Column names obtained by splitting the first input line on ','.</summary>
    public string[] colNames { get; set; }

    // Raw header line; written as the first line of every output file.
    private string strcolNames;

    // Only read by the OpenFiles stub; never assigned in this class.
    private int maxNumOfFiles;

    // Zero-based index of the last line read; -1 until the first line has been read.
    private int m_numofLines = -1;

    /// <summary>Zero-based index of the last input line read by the conversion (-1 if none).</summary>
    public int numofLines {
        get {
            return this.m_numofLines;
        }
    }

    // Position of strFeature inside colNames.
    private int colIndex;

    // One open writer per feature value seen so far.
    private Dictionary<int, StreamWriter> dictSW = new Dictionary<int, StreamWriter>();

    /// <summary>
    /// Stores the settings and immediately runs the whole conversion.
    /// </summary>
    /// <param name="filepath">Path of the input file.</param>
    /// <param name="output">Directory for the generated files.</param>
    /// <param name="feature">Header name of the column to split by.</param>
    /// <param name="numstartline">Zero-based index of the first data line.</param>
    public CSVFileConverter(string filepath, string output, string feature, int numstartline) {
        this.FilePath = filepath;
        this.OutputDirectory = output;
        this.strFeature = feature;
        this.numStartLine = numstartline;
        this.GetColNames();
        try {
            this.Converter();
        } finally {
            // Always release the file handles, even when the conversion fails half-way.
            this.CloseStreams();
        }
    }

    /// <summary>
    /// Reads the header line, fills <see cref="colNames"/> and locates the feature column.
    /// On any failure (unreadable file, empty file, missing column) the problem is printed
    /// and the process is terminated with a non-zero exit code.
    /// </summary>
    private void GetColNames() {
        string header;
        try {
            header = ReadCSVLines(this.FilePath).FirstOrDefault();
        } catch (Exception ex) {
            // Top-level boundary: the process is terminated right after reporting.
            ExitWithError(ex);
            return;
        }
        if (header == null) {
            ExitWithError(string.Format("The File is empty: {0}", this.FilePath));
            return;
        }
        strcolNames = header;
        colNames = header.Split(',');
        this.colIndex = Array.IndexOf(colNames, this.strFeature);
        if (this.colIndex < 0) {
            ExitWithError(string.Format("The File doesn't have this specified feature: {0}", this.strFeature));
        }
    }

    // Prints the error and terminates the process with exit code 1 so that callers and
    // scripts can tell the run failed (exit code 0 would signal success).
    private static void ExitWithError(object error) {
        Console.WriteLine(error);
        Environment.Exit(1);
    }

    // Placeholder with an empty loop body; it is not called anywhere in this class.
    private void OpenFiles() {
        for (int i = 0; i <= this.maxNumOfFiles; i++) {
        }
    }

    /// <summary>
    /// Streams the input file and appends every valid data row to the output file that
    /// belongs to its feature value. Rows whose feature value is missing, is not an
    /// integer, or lies outside [0, 100] are reported on the console and skipped.
    /// </summary>
    private void Converter() {
        foreach (string strline in ReadCSVLines(this.FilePath)) {
            m_numofLines++;
            if (m_numofLines < this.numStartLine) {
                continue; // skip lines before the first data line
            }

            string[] fields = strline.Split(',');
            // A short row has no feature column at all; treat it as an empty (invalid) value
            // instead of indexing past the end of the array.
            string rawValue = this.colIndex < fields.Length ? fields[this.colIndex] : string.Empty;

            int id;
            // Validate BEFORE writing so rejected rows never create or reach an output file.
            if (!TryParseId(rawValue, out id) || id < MinFeatureValue || id > MaxFeatureValue) {
                Console.WriteLine("Line {0} is invalid input: {1} = {2}.", m_numofLines, this.strFeature, rawValue);
                continue;
            }

            // No per-line Flush: the writer buffers, and CloseStreams flushes on dispose.
            GetWriter(id).WriteLine(strline);

            if ((m_numofLines % ProgressInterval) == 0) {
                Console.WriteLine("numLines = {0}", m_numofLines);
            }
        }
    }

    // Parses a feature value as a culture-independent integer (optional sign and
    // surrounding whitespace allowed). Returns false instead of throwing on bad input.
    private static bool TryParseId(string text, out int id) {
        return int.TryParse(
            text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out id);
    }

    // Returns the writer for the given feature value, creating the output file and
    // writing the header line the first time the value is seen.
    private StreamWriter GetWriter(int id) {
        StreamWriter sw;
        if (!dictSW.TryGetValue(id, out sw)) {
            // Path.Combine inserts the directory separator when OutputDirectory lacks one.
            string filename = Path.Combine(this.OutputDirectory ?? string.Empty, "file" + id.ToString() + ".csv");
            sw = new StreamWriter(filename);
            dictSW.Add(id, sw);
            sw.WriteLine(this.strcolNames);
        }
        return sw;
    }

    /// <summary>Disposes (and thereby flushes) every open output writer.</summary>
    private void CloseStreams() {
        foreach (StreamWriter sw in dictSW.Values) {
            sw.Dispose();
        }
        dictSW.Clear();
    }

    /// <summary>
    /// Lazily yields the lines of the given file one at a time; the reader is closed
    /// when enumeration finishes or the enumerator is disposed.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    private static IEnumerable<string> ReadCSVLines(string filepath) {
        using (StreamReader sr = new StreamReader(filepath)) {
            string line;
            while ((line = sr.ReadLine()) != null) {
                yield return line;
            }
        }
    }
}
}
namespace ConsoleApplication1 {
/// <summary>
/// Console entry point that runs a single CSV split with fixed paths.
/// </summary>
class Program {
    /// <summary>
    /// Splits the training CSV by column "a", starting at line index 1, then waits
    /// for a key press before the process ends.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args) {
        string sourceCsv = @"D:\data\train.csv";
        string targetFolder = @"D:\data\processedfiles\";

        // Constructing the converter performs the whole conversion; the instance itself is not needed afterwards.
        new CSVFileConverter(sourceCsv, targetFolder, "a", 1);

        Console.ReadKey();
    }
}
}
谢谢。