如何识别给定文件中的所有非UTF8字符?
我们需要用C#编写它,并能够在SSIS环境中执行它。 执行完之后,我们需要找出并检查所有错误的情况,并最终将它们的行号输入到输入文件中。
假设: -文件格式正确(在我们的示例中), -新行有CR LF
答案 0 :(得分:0)
经过一些研究,我们收集了一些提示:
这是我们学到的东西:
SO:我们需要改进utf8checker类的版本,以便继续扫描整个文件,并且在第一次出现错误时没有完成。 完成扫描后,代码将生成一个日志文件,列出所有非utf8事件。
以下代码在我们的情况下有效。它在SSIS脚本任务中执行,并从输入参数中读取文件名。
也许可以进一步改善。
/*
Microsoft SQL Server Integration Services Script Task
Write scripts using Microsoft Visual C# 2008.
The ScriptMain is the entry point class of the script.
*/
using System;
using System.Data;
using Microsoft.SqlServer.Dts.Runtime;
using System.Windows.Forms;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
namespace ST_5c3d8ec1340c4ab9bbb71cb975760e42.csproj
{
[System.AddIn.AddIn("ScriptMain", Version = "1.0", Publisher = "", Description = "")]
public partial class ScriptMain : Microsoft.SqlServer.Dts.Tasks.ScriptTask.VSTARTScriptObjectModelBase
{
public void Main()
{
String fileToCheck, logFileName;
bool OK_UTF8;
IUtf8Checker fileCheckerUtf8 = new Utf8Checker();
List<IErrorUtf8Checker> errorsList;
System.IO.StreamWriter logFile;
try
{
fileToCheck = Dts.Variables["User::InputFile"].Value.ToString();
logFileName = fileToCheck + "_utf8check.log";
if (File.Exists(fileToCheck))
{
OK_UTF8 = fileCheckerUtf8.Check(fileToCheck);
if (OK_UTF8 == false)
{
errorsList = fileCheckerUtf8.GetErrorList();
logFile = new StreamWriter(logFileName);
int i = 0;
foreach (ErrorUtf8Checker e in errorsList)
{
logFile.WriteLine(++i + ") " + e.ToString());
}
logFile.Close();
}
}
//exit always with success. It writes a log file if any warning occurs
Dts.TaskResult = (int)ScriptResults.Success;
}
catch (DecoderFallbackException eUTF)
{
Console.Write(eUTF.ToString());
Dts.TaskResult = (int)ScriptResults.Failure;
}
catch (Exception e)
{
Console.Write(e.ToString());
Dts.TaskResult = (int)ScriptResults.Failure;
}
}
#region VSTA generated code
enum ScriptResults
{
Success = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Success,
Failure = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Failure
};
#endregion
/**
* PrintOnSSISConsole
* Used to print a string s into the immediate console of SSIS
*/
public void PrintOnSSISConsole(String s)
{
System.Diagnostics.Debug.WriteLine(s);
}
/// <summary>
/// Interface for checking for utf8.
/// </summary>
public interface IUtf8Checker
{
/// <summary>
/// Check if file is utf8 encoded.
/// </summary>
/// <param name="fileName"></param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool Check(string fileName);
/// <summary>
/// Check if stream is utf8 encoded.
/// </summary>
/// <param name="stream"></param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool IsUtf8(Stream stream);
/// <summary>
/// Return a list of found errors of type of IErrorUtf8Checker
/// </summary>
/// <returns>List of errors found through the Check metod</returns>
List<IErrorUtf8Checker> GetErrorList();
}
public interface IErrorUtf8Checker
{
}
/// <summary>
/// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
///
/// http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html
///
/// http://www.unicode.org/versions/corrigendum1.html
///
/// http://www.ietf.org/rfc/rfc2279.txt
///
/// </summary>
public class Utf8Checker : IUtf8Checker
{
// newLineArray = used to understand the new line sequence
private static byte[] newLineArray = new byte[2] { 13, 10 };
private int line = 1;
private byte[] lineArray = new byte[2] { 0, 0 };
// used to keep trak of number of errors found into the file
private List<IErrorUtf8Checker> errorsList;
public Utf8Checker()
{
this.errorsList = new List<IErrorUtf8Checker>();
}
public int getNumberOfErrors()
{
return errorsList.Count();
}
public bool Check(string fileName)
{
using (BufferedStream fstream = new BufferedStream(File.OpenRead(fileName)))
{
return this.IsUtf8(fstream);
}
}
public int getLine()
{
return line;
}
public List<IErrorUtf8Checker> GetErrorList()
{
return errorsList;
}
/// <summary>
/// Check if stream is utf8 encoded.
/// Notice: stream is read completely in memory!
/// </summary>
/// <param name="stream">Stream to read from.</param>
/// <returns>True if the whole stream is utf8 encoded.</returns>
public bool IsUtf8(Stream stream)
{
int count = 4 * 1024;
byte[] buffer;
int read;
while (true)
{
buffer = new byte[count];
stream.Seek(0, SeekOrigin.Begin);
read = stream.Read(buffer, 0, count);
if (read < count)
{
break;
}
buffer = null;
count *= 2;
}
return IsUtf8(buffer, read);
}
/// <summary>
///
/// </summary>
/// <param name="buffer"></param>
/// <param name="length"></param>
/// <returns></returns>
public bool IsUtf8(byte[] buffer, int length)
{
int position = 0;
int bytes = 0;
bool ret = true;
while (position < length)
{
if (!IsValid(buffer, position, length, ref bytes))
{
ret = false;
errorsList.Add(new ErrorUtf8Checker(getLine(), buffer[position]));
}
position += bytes;
}
return ret;
}
/// <summary>
///
/// </summary>
/// <param name="buffer"></param>
/// <param name="position"></param>
/// <param name="length"></param>
/// <param name="bytes"></param>
/// <returns></returns>
public bool IsValid(byte[] buffer, int position, int length, ref int bytes)
{
if (length > buffer.Length)
{
throw new ArgumentException("Invalid length");
}
if (position > length - 1)
{
bytes = 0;
return true;
}
byte ch = buffer[position];
char ctest = (char)ch; // for debug only
this.detectNewLine(ch);
if (ch <= 0x7F)
{
bytes = 1;
return true;
}
if (ch >= 0xc2 && ch <= 0xdf)
{
if (position >= length - 2)
{
bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 2;
return true;
}
if (ch == 0xe0)
{
if (position >= length - 3)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0xa0 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 3;
return true;
}
if (ch >= 0xe1 && ch <= 0xef)
{
if (position >= length - 3)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 3;
return true;
}
if (ch == 0xf0)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x90 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
if (ch == 0xf4)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0x8f ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
if (ch >= 0xf1 && ch <= 0xf3)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
return false;
}
private void detectNewLine(byte ch)
{
// looking for second char for new line (char 13 feed)
if (this.lineArray[0] == newLineArray[0])
{
if (ch == newLineArray[1])
{
// found new line
this.lineArray[1] = ch;
line++;
// reset work array: lineArray
this.lineArray[1] = 0;
}
// we have to reset work array because CR(13)LF(10) must be in sequence
this.lineArray[0] = 0;
}
else
{
// found first character (char 10 return)
if (ch == newLineArray[0])
{
this.lineArray[0] = ch;
}
}
}
}
public class ErrorUtf8Checker : IErrorUtf8Checker
{
private int line;
private byte ch;
public ErrorUtf8Checker(int line, byte character)
{
this.line = line;
this.ch = character;
}
public ErrorUtf8Checker(int line)
{
this.line = line;
}
public override string ToString()
{
string s;
try
{
if (ch > 0)
{
s = "line: " + line + " code: " + ch + ", char: " + (char)ch;
}
else
{
s = "line: " + line;
}
return s;
}
catch (Exception e)
{
Console.Write(e.ToString());
return base.ToString();
}
}
}
}
}
举个例子:
Hello world test UTF8
err 1: °
text ok line 3
err 2: ò
errs 3: à è § °
end file
发布的代码将创建一个新文件,其中包含:
1) line: 2 code: 176, char: °
2) line: 4 code: 242, char: ò
3) line: 5 code: 224, char: à
4) line: 5 code: 232, char: è
5) line: 5 code: 167, char: §
6) line: 5 code: 176, char: °
答案 1 :(得分:0)
当您将文件加载到字节数组中,然后尝试将其加载到字符串时,无效的UTF8字符将被替换为? (问号)。您的代码应如下所示:
byte[] data = File.ReadAllBytes(pathToYourFile);
string result = Encoding.UTF8.GetString(data);
接下来,您可以执行例如清洁步骤?