我想将平面文件 test.txt 转换为平面文件 test-output.txt 。
<hr/> 示例输入:test.txt
COD/ID:37
PRJ/NAME: Josephy Murphy
PRJ/EMAIL: jmurphy@email.com
PRJ/DESCRIPTION: test37, test37, test37 ...
COD/ID:38
PRJ/NAME: Paul Newman
PRJ/EMAIL: pnewman@email.com
PRJ/DESCRIPTION: test38, test38, test38 ...
.
.
示例输出:test-output.txt(没有标签的管道分隔)
37|Josephy Murphy|jmurphy@email.com|test37, test37, test37 ...
38|Paul Newman|pnewman@email.com|test38, test38, test38 ...
.
.
屏幕截图的链接:
test.txt
test-output.txt
我想将此文件导入SQL Server。但是文件test.txt(15,000,000行)不是导入向导默认支持的带分隔符的格式。
我将使用SSIS导入数据,但必须是CSV格式或带分隔符的其他格式。
我想过使用REGEX或SSIS脚本组件。我知道带有格式化文本的SSIS文件的导入过程,但是这个文件没有格式化。
答案 0 :(得分:2)
以Regex为例:
/// <summary>
/// Demonstrates converting tagged records (COD/ID, PRJ/NAME, PRJ/EMAIL,
/// PRJ/DESCRIPTION) held in a string into pipe-delimited lines using a regex.
/// </summary>
class Program
{
    // One match per record. Compared with a pattern that hard-codes "\r\n" and a
    // single whitespace after each tag, this one:
    //   * accepts both CRLF and LF line endings ("\r?\n"),
    //   * accepts zero or more spaces/tabs after the colon ("COD/ID:37" and "COD/ID: 37"),
    //   * never lets a trailing '\r' or trailing blanks leak into a captured value.
    private static readonly Regex reg = new Regex(
        @"COD/ID:[ \t]*(?<id>\d+)[ \t]*\r?\n" +
        @"PRJ/NAME:[ \t]*(?<name>[^\r\n]+?)[ \t]*\r?\n" +
        @"PRJ/EMAIL:[ \t]*(?<email>\S+?@\S+?\.\S+?)[ \t]*\r?\n" +
        @"PRJ/DESCRIPTION:[ \t]*(?<description>[^\r\n]*?)[ \t]*(?=[\r\n]|\z)");

    /// <summary>
    /// Parses the embedded sample text and prints one "id|name|email|description"
    /// line per record to standard output.
    /// </summary>
    /// <param name="args">Unused command-line arguments.</param>
    static void Main(string[] args)
    {
        // The literal's lines must start at column 0: each tag has to follow the
        // previous line break directly for the pattern to match.
        string original = @"
COD/ID: 37
PRJ/NAME: Josephy Murphy
PRJ/EMAIL: jmurphy@email.com
PRJ/DESCRIPTION: test37, test37, test37 ...
COD/ID: 38
PRJ/NAME: Paul Newman
PRJ/EMAIL: pnewman@email.com
PRJ/DESCRIPTION: test38, test38, test38 ...";

        string result = string.Join(
            "\n",
            reg.Matches(original)
                .Cast<Match>()
                .Select(m => string.Format(
                    "{0}|{1}|{2}|{3}",
                    m.Groups["id"].Value,
                    m.Groups["name"].Value,
                    m.Groups["email"].Value,
                    m.Groups["description"].Value)));

        Console.WriteLine(result);
    }
}
**修改**
/// <summary>
/// Streams a tagged flat file (COD/ID, PRJ/NAME, PRJ/EMAIL, PRJ/DESCRIPTION)
/// and writes one pipe-delimited line per record, keeping only a single record
/// in memory at a time.
/// </summary>
class Program
{
    // Matches one buffered record. Tolerates CRLF or LF line endings and any
    // number of spaces/tabs after each tag, so it works regardless of what
    // StringBuilder.AppendLine emits on the current platform.
    private static readonly Regex reg = new Regex(
        @"COD/ID:[ \t]*(?<id>\d+)[ \t]*\r?\n" +
        @"PRJ/NAME:[ \t]*(?<name>[^\r\n]+?)[ \t]*\r?\n" +
        @"PRJ/EMAIL:[ \t]*(?<email>\S+?@\S+?\.\S+?)[ \t]*\r?\n" +
        @"PRJ/DESCRIPTION:[ \t]*(?<description>[^\r\n]*?)[ \t]*(?=[\r\n]|\z)");

    /// <summary>
    /// Reads "YourInputPath.txt" line by line and writes the converted records
    /// to "YourOutputPath.txt".
    /// </summary>
    /// <param name="args">Unused command-line arguments.</param>
    static void Main(string[] args)
    {
        StringBuilder intermediateStringBuilder = new StringBuilder();
        using (StreamReader reader = new StreamReader(@"YourInputPath.txt", true))
        using (StreamWriter writer = new StreamWriter("YourOutputPath.txt"))
        {
            string line;
            // ReadLine returns null at end of stream; this avoids relying on Peek().
            while ((line = reader.ReadLine()) != null)
            {
                string trimmed = line.Trim();
                if (trimmed.Length == 0)
                {
                    // A blank line terminates the current record.
                    WriteToFile(intermediateStringBuilder, writer);
                    continue;
                }

                // A new COD/ID header also terminates the previous record, so
                // input without blank separator lines is still split correctly
                // and the buffer never grows beyond one record.
                if (trimmed.StartsWith("COD/ID:", StringComparison.Ordinal))
                {
                    WriteToFile(intermediateStringBuilder, writer);
                }

                intermediateStringBuilder.AppendLine(trimmed);
            }

            // Flush the last record when the file does not end with a blank line.
            WriteToFile(intermediateStringBuilder, writer);
        }
    }

    /// <summary>
    /// Converts the buffered record to "id|name|email|description", writes it,
    /// and clears the buffer.
    /// </summary>
    /// <param name="intermediateStringBuilder">Buffer holding the lines of one record; always cleared on return.</param>
    /// <param name="writer">Destination for the converted line.</param>
    private static void WriteToFile(StringBuilder intermediateStringBuilder, StreamWriter writer)
    {
        // Nothing buffered (e.g. consecutive blank lines): do not emit an empty "|||" row.
        if (intermediateStringBuilder.Length == 0)
        {
            return;
        }

        string record = intermediateStringBuilder.ToString();
        intermediateStringBuilder.Clear();

        Match m = reg.Match(record);
        if (!m.Success)
        {
            // Report malformed input instead of writing an all-empty row that
            // would corrupt the import.
            Console.Error.WriteLine("Skipping malformed record: " + record.Trim());
            return;
        }

        writer.WriteLine(
            "{0}|{1}|{2}|{3}",
            m.Groups["id"].Value,
            m.Groups["name"].Value,
            m.Groups["email"].Value,
            m.Groups["description"].Value);
    }
}
答案 1 :(得分:0)
在这种情况下,您可以在没有正则表达式的情况下执行此操作,因为上下文已知。
使用此:
/// <summary>
/// Holds the four fields of one input record and renders them as a single
/// pipe-delimited line.
/// </summary>
public class EntryN
{
    public string id { get; set; }
    public string name { get; set; }
    public string email { get; set; }
    public string description { get; set; }

    /// <summary>Creates an entry whose fields are all empty strings.</summary>
    public EntryN()
    {
        id = string.Empty;
        name = string.Empty;
        email = string.Empty;
        description = string.Empty;
    }

    /// <summary>Returns the fields joined as "id|name|email|description".</summary>
    public string ToLine()
    {
        string[] fields = new string[] { id, name, email, description };
        return string.Join("|", fields);
    }
}
// Parses the tagged input file into a list of EntryN records, then renders
// each record as a pipe-delimited line.
//
// A record ends at a blank line, at the next "COD/ID:" header, or at end of
// file. Values are taken from everything after the tag's colon and trimmed, so
// "COD/ID:37" and "COD/ID: 37" both yield "37".
var entries = new List<EntryN>();
using (var sl = new StreamReader(@"c:\YOURPATH.txt", true))
{
    var entry = new EntryN();
    // True once any field of the current entry has been assigned; used instead
    // of EntryN.Equals, which is reference equality and can never detect an
    // untouched entry.
    var hasData = false;
    string line;
    while ((line = sl.ReadLine()) != null)
    {
        var text = line.Trim();
        if (text.Length == 0)
        {
            // Blank separator: store the record, but never an empty one
            // (consecutive or trailing blank lines).
            if (hasData)
            {
                entries.Add(entry);
                entry = new EntryN();
                hasData = false;
            }
        }
        else if (text.StartsWith("COD/ID:", System.StringComparison.Ordinal))
        {
            // A new header also closes the previous record, so input without
            // blank separator lines does not overwrite earlier records.
            if (hasData)
            {
                entries.Add(entry);
                entry = new EntryN();
            }
            entry.id = text.Substring("COD/ID:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/NAME:", System.StringComparison.Ordinal))
        {
            entry.name = text.Substring("PRJ/NAME:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/EMAIL:", System.StringComparison.Ordinal))
        {
            entry.email = text.Substring("PRJ/EMAIL:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/DESCRIPTION:", System.StringComparison.Ordinal))
        {
            entry.description = text.Substring("PRJ/DESCRIPTION:".Length).Trim();
            hasData = true;
        }
    }

    // Last record when the file does not end with a blank line.
    if (hasData)
    {
        entries.Add(entry);
    }
    // No explicit Close(): the using block disposes the reader.
}
var resulted = entries.Select(p => p.ToLine()).ToList();
输出:
编辑:另一个没有单独类的代码,可直接编写而不创建其他字符串:
// Streams INPUT_FILE and writes one "id|name|email|description" line per record
// to OUTPUT_FILE without keeping more than one record in memory.
//
// A record ends at a blank line, at the next "COD/ID:" header, or at end of
// file. Values are taken from everything after the tag's colon and trimmed, so
// "COD/ID:37" and "COD/ID: 37" both yield "37".
var id = string.Empty;
var name = string.Empty;
var email = string.Empty;
var description = string.Empty;
// True once any field of the current record has been read; prevents empty
// "|||" rows and keeps a final record whose description happens to be empty.
var hasData = false;
using (var sw = new StreamWriter(@"OUTPUT_FILE", false, Encoding.UTF8))
using (var sl = new StreamReader(@"INPUT_FILE", true))
{
    string line;
    while ((line = sl.ReadLine()) != null)
    {
        var text = line.Trim();
        var startsRecord = text.StartsWith("COD/ID:", System.StringComparison.Ordinal);

        // Flush the pending record on a blank separator or when the next
        // record's header arrives (input without blank separator lines).
        if ((text.Length == 0 || startsRecord) && hasData)
        {
            sw.WriteLine("{0}|{1}|{2}|{3}", id, name, email, description);
            id = name = email = description = string.Empty;
            hasData = false;
        }

        if (startsRecord)
        {
            id = text.Substring("COD/ID:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/NAME:", System.StringComparison.Ordinal))
        {
            name = text.Substring("PRJ/NAME:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/EMAIL:", System.StringComparison.Ordinal))
        {
            email = text.Substring("PRJ/EMAIL:".Length).Trim();
            hasData = true;
        }
        else if (text.StartsWith("PRJ/DESCRIPTION:", System.StringComparison.Ordinal))
        {
            description = text.Substring("PRJ/DESCRIPTION:".Length).Trim();
            hasData = true;
        }
    }

    // Last record when the file does not end with a blank line.
    if (hasData)
    {
        sw.WriteLine("{0}|{1}|{2}|{3}", id, name, email, description);
    }
    // No explicit Close(): the using blocks flush and dispose both streams.
}