使用C#将制表符分隔的文本文件中的HTML读取为字符串

时间:2019-01-03 04:18:03

标签: c# html decode encode

我正在尝试从制表符分隔的文本文件中读取HTML,用它创建一个HTML文件,然后将该文件转换为PDF。当我读取这个文本文件时,结果中出现了 ' 以及其他一些奇怪的字符。这是我的代码:

        // Read every row of the tab-separated file, decoding its bytes as Windows-1255.
        var lines = System.IO.File.ReadAllLines(@"C:\temp\Laura.txt", Encoding.GetEncoding("Windows-1255"));
        // Map each row onto an Articles object: column 0 -> id, column 1 -> name, column 2 -> body.
        // A row with fewer than three tab-separated fields makes the indexing below throw.
        var csv = lines.Select(x =>
        {
            var parts = x.Split('\t');
            return new Articles()
            {
                id = parts[0].Trim(),
                name = parts[1].Trim(),
                body = parts[2].Trim(),
                // Encode/decode variants that were tried in place of the assignment above:
                //body = WebUtility.HtmlDecode(parts[2].Trim()),
                //body = HttpUtility.HtmlEncode(parts[2].Trim()),
                //body = WebUtility.HtmlEncode(parts[2].Trim()),
                //body = SecurityElement.Escape(parts[2].Trim()),
            };
        }).ToList();
        // For each article: write its body to c:\temp\<name>.html, then have Word print it to <name>.pdf.
       foreach (var item in csv)
        {
            // NOTE(review): `id` is assigned from item.name (not item.id) and is not read in the visible code.
            string id = item.name;
            string filename = item.name + ".html";
            string body = item.body;
            string path = @"c:\temp\" + filename;

            // Skip articles whose HTML file already exists, so each one is generated only once.
            if (!File.Exists(path))
            {
                // Create the HTML file. No encoding is passed here, so the file is not written
                // as Windows-1255 even though the input was decoded with it.
                // NOTE(review): presumably this is where the odd characters come from once Word
                // guesses the file's encoding - confirm.
                File.WriteAllText(path, body);
                // Start a Word instance and open the freshly written HTML file in it.
                // NOTE(review): neither the document nor the application is closed in the visible code.
                Microsoft.Office.Interop.Word.Application ap = new Microsoft.Office.Interop.Word.Application();
                Document document = ap.Documents.Open(path);

                object oFalse = false;
                object oTrue = true;
                // Target PDF path: same directory and base name as the HTML file, ".pdf" extension.
                object OutputFileName = Path.Combine(
                Path.GetDirectoryName(path),
                Path.GetFileNameWithoutExtension(path) + ".pdf");
                // Placeholder passed for every optional COM argument that is left unspecified.
                object missing = System.Reflection.Missing.Value;
                // NOTE(review): PrintToFile is left as `missing`; verify that OutputFileName is
                // honored in that case and that the active printer actually produces a PDF.
                document.PrintOut(
                oTrue,          // Background
                oFalse,         // Append
                ref missing,    // Range
                OutputFileName, // OutputFileName
                ref missing,    // From
                ref missing,    // To
                ref missing,    // Item
                ref missing,    // Copies
                ref missing,    // Pages
                ref missing,    // PageType
                ref missing,    // PrintToFile
                ref missing,    // Collate
                ref missing,    // ActivePrinterMacGX
                ref missing,    // ManualDuplexPrint
                ref missing,    // PrintZoomColumn
                ref missing,    // PrintZoomRow
                ref missing,    // PrintZoomPaperWidth
                ref missing     // PrintZoomPaperHeight
                );
            }
        }

我尝试了注释掉的代码,但是似乎没有任何效果。

1 个答案:

答案 0 :(得分:0)

请尝试以下代码:

// Load all rows of the tab-separated file, decoding its bytes as Windows-1255.
var lines = System.IO.File.ReadAllLines(
    @"C:\temp\Laura.txt",
    Encoding.GetEncoding("Windows-1255"));

// Split each row on tabs, then build one Articles object per row:
// column 0 -> id, column 1 -> name, column 2 -> body (each trimmed).
var csv = lines
    .Select(row => row.Split('\t'))
    .Select(fields => new Articles()
    {
        id = fields[0].Trim(),
        name = fields[1].Trim(),
        body = fields[2].Trim(),
    })
    .ToList();

然后尝试使用 ExportAsFixedFormat,并指定 WdExportFormat.wdExportFormatPDF 将文档导出为 PDF:

// Read the source HTML, decoding its bytes as Windows-1255.
// NOTE(review): `lines` is not used below, and the document opened is 2.html rather than
// 1.html - presumably a step that re-saves this text as 2.html is missing; confirm.
var lines = System.IO.File.ReadAllText(@"1.html", Encoding.GetEncoding("Windows-1255"));
var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"2.html");

// Target PDF: same directory and base name as the HTML file, ".pdf" extension.
var OutputFileName = Path.Combine(
    Path.GetDirectoryName(path),
    Path.GetFileNameWithoutExtension(path) + ".pdf");

var app = new Microsoft.Office.Interop.Word.Application();
try
{
    var doc = app.Documents.Open(path, false);
    try
    {
        // Export directly to PDF instead of going through a printer driver.
        doc.ExportAsFixedFormat(OutputFileName, WdExportFormat.wdExportFormatPDF);
    }
    finally
    {
        // false = do not save changes; the HTML was only opened to be exported.
        doc.Close(false);
    }
}
finally
{
    // Without Quit the WINWORD.EXE process started above keeps running after the export.
    app.Quit();
}

完整代码

/// <summary>
/// Reads the tab-separated file <c>C:\temp\Laura.txt</c> (decoded as Windows-1255), writes the
/// third column of every row to <c>c:\temp\&lt;name&gt;.html</c>, and exports each new HTML
/// file to a PDF with the same base name through Word.
/// </summary>
/// <remarks>
/// Rows whose HTML file already exists are skipped, so re-running does not regenerate them.
/// Blank rows are ignored. A non-blank row with fewer than three tab-separated fields still
/// throws <see cref="IndexOutOfRangeException"/>.
/// A single Word instance is started on demand, reused for all documents, and always quit.
/// </remarks>
static void connvert()
{
    var lines = File.ReadAllLines(
        @"C:\temp\Laura.txt",
        Encoding.GetEncoding("Windows-1255"));

    var csv = lines
        // A blank row has no columns to index; skip it instead of crashing on parts[1].
        .Where(x => !string.IsNullOrWhiteSpace(x))
        .Select(x =>
        {
            var parts = x.Split('\t');
            return new Articles()
            {
                id = parts[0].Trim(),
                name = parts[1].Trim(),
                body = parts[2].Trim(),
            };
        })
        .ToList();

    // Started lazily so Word is not launched when every HTML file already exists.
    Application app = null;
    try
    {
        foreach (var item in csv)
        {
            string path = @"c:\temp\" + item.name + ".html";

            // Each HTML/PDF pair is generated only once.
            if (File.Exists(path))
            {
                continue;
            }

            // Encoding.Unicode writes UTF-16 with a byte-order mark, so Word does not have to
            // guess the encoding. If the output is still garbled, try writing with
            // Encoding.GetEncoding("Windows-1255") instead.
            File.WriteAllText(path, item.body, Encoding.Unicode);

            if (app == null)
            {
                app = new Application();
            }

            var OutputFileName = Path.Combine(
                Path.GetDirectoryName(path),
                Path.GetFileNameWithoutExtension(path) + ".pdf");

            var doc = app.Documents.Open(path, false);
            try
            {
                doc.ExportAsFixedFormat(OutputFileName, WdExportFormat.wdExportFormatPDF);
            }
            finally
            {
                // false = do not save changes; the HTML was only opened to be exported.
                doc.Close(false);
            }
        }
    }
    finally
    {
        // Without Quit the WINWORD.EXE process keeps running after the method returns.
        if (app != null)
        {
            app.Quit();
        }
    }
}