Question

我想为 PDF 中的字体生成 ToUnicode 映射（CMap）。这个 PDF 里有一种字体缺少 Unicode 映射；我需要从页面中提取文本，但没有该映射时提取出来的都是乱码。我参考 iTextSharp 源代码中的实现写了下面这段代码。然而，pageText1 和 pageText2 似乎都忽略了我定义的 metrics（映射表）。

/// <summary>
/// Program entry point: runs the extraction for page 1 of "test.pdf".
/// The extracted text is not used further here.
/// </summary>
static void Main(string[] args)
    {
        const int firstPage = 1;
        string extractedText = ReadFilePart("test.pdf", firstPage);
    }

    /// <summary>
    /// Extracts the text of one page after the fonts of the document have been
    /// patched by <c>fontFix</c>.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to open.</param>
    /// <param name="pageNumber">1-based number of the page to extract.</param>
    /// <returns>The page text extracted after the font patch was applied.</returns>
    private static string ReadFilePart(string fileName, int pageNumber)
    {
        var pdfReader = new PdfReader(fileName);
        try
        {
            // Baseline extraction taken before the fonts are patched; it is kept
            // only so both results can be compared in a debugger.
            var pageText1 = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, new SimpleTextExtractionStrategy());

            pdfReader = fontFix(pdfReader);

            var pageText2 = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, new SimpleTextExtractionStrategy());

            // The original returned an undeclared identifier (pageText); the
            // post-fix extraction is the value the caller is after.
            return pageText2;
        }
        finally
        {
            // Release the file even when extraction throws.
            pdfReader.Close();
        }
    }

    /// <summary>
    /// Points the /ToUnicode entry of every font found in the page resources of
    /// <paramref name="reader"/> at a generated CMap that maps the codes 32..75
    /// to the Unicode code points with the same values.
    /// </summary>
    /// <param name="reader">The opened document; its font dictionaries are modified in place.</param>
    /// <returns>The same <paramref name="reader"/> instance.</returns>
    static PdfReader fontFix(PdfReader reader)
    {
        // Each entry is { source code, unused, Unicode code point }; only index 0
        // and index 2 are read when the CMap is built. This reproduces the
        // original 44-row table, where every row held the same value three times.
        const int firstCode = 32;
        const int entryCount = 44;
        int[][] metrics = new int[entryCount][];
        for (int i = 0; i < entryCount; i++)
        {
            int code = firstCode + i;
            metrics[i] = new[] { code, code, code };
        }

        // The CMap does not depend on the font, so build it once.
        // NOTE(review): the CMap declares a two-byte code space (<0000>-<FFFF>);
        // confirm that the fonts being patched really use two-byte codes.
        byte[] cmapBytes = PdfEncodings.ConvertToBytes(BuildToUnicodeCMap(metrics), null);
        PRIndirectReference toUnicodeReference = null;

        int documentPages = reader.NumberOfPages;
        for (int page = 1; page <= documentPages; page++)
        {
            PdfDictionary pageResources = reader.GetPageResources(page);
            if (pageResources == null)
            {
                continue;
            }

            PdfDictionary pageFonts = pageResources.GetAsDict(PdfName.FONT);
            if (pageFonts == null || pageFonts.Size == 0)
            {
                continue;
            }

            foreach (PdfName key in pageFonts.Keys)
            {
                PdfDictionary fontDictionary = pageFonts.GetAsDict(key);
                if (fontDictionary == null)
                {
                    continue;
                }

                if (toUnicodeReference == null)
                {
                    // NOTE(review): the text-extraction font class appears to read
                    // /ToUnicode only when it resolves to a PRStream, so a plain
                    // PdfStream would be ignored - verify against the iTextSharp
                    // version in use. The stream is registered with the reader so
                    // it can be referenced indirectly and shared by all fonts.
                    toUnicodeReference = reader.AddPdfObject(new PRStream(reader, cmapBytes));
                }

                // NOTE(review): this overwrites an existing /ToUnicode entry as
                // well, as the original code did.
                fontDictionary.Put(PdfName.TOUNICODE, toUnicodeReference);
            }
        }
        return reader;
    }

    /// <summary>
    /// Builds a Flate-compressed stream holding a ToUnicode CMap for the given metrics.
    /// </summary>
    /// <param name="metrics">
    /// Array of <c>int[]</c> entries; element 0 is the source code and element 2
    /// the Unicode code point it maps to.
    /// </param>
    /// <returns>The CMap stream, or null when <paramref name="metrics"/> is null or empty.</returns>
    public static PdfStream GetToUnicode(Object[] metrics)
    {
        string cmap = BuildToUnicodeCMap(metrics);
        if (cmap == null)
        {
            return null;
        }

        PdfStream stream = new PdfStream(PdfEncodings.ConvertToBytes(cmap, null));
        stream.FlateCompress(-1);
        return stream;
    }

    /// <summary>
    /// Produces the text of a ToUnicode CMap with one bfrange line per metric,
    /// or null when there is nothing to map.
    /// </summary>
    private static string BuildToUnicodeCMap(Object[] metrics)
    {
        if (metrics == null || metrics.Length == 0)
        {
            return null;
        }

        StringBuilder buf = new StringBuilder(
            "/CIDInit /ProcSet findresource begin\n" +
            "12 dict begin\n" +
            "begincmap\n" +
            "/CIDSystemInfo\n" +
            "<< /Registry (TTX+0)\n" +
            "/Ordering (T42UV)\n" +
            "/Supplement 0\n" +
            ">> def\n" +
            "/CMapName /TTX+0 def\n" +
            "/CMapType 2 def\n" +
            "1 begincodespacerange\n" +
            "<0000><FFFF>\n" +
            "endcodespacerange\n");

        // Entries are emitted in sections of at most 100 lines each.
        const int maxEntriesPerSection = 100;
        for (int start = 0; start < metrics.Length; start += maxEntriesPerSection)
        {
            int count = Math.Min(maxEntriesPerSection, metrics.Length - start);
            buf.Append(count).Append(" beginbfrange\n");
            for (int k = start; k < start + count; ++k)
            {
                int[] metric = (int[])metrics[k];
                // A single-code range: start and end are the same source code.
                string fromTo = ToHex(metric[0]);
                buf.Append(fromTo).Append(fromTo).Append(ToHex(metric[2])).Append('\n');
            }
            buf.Append("endbfrange\n");
        }

        buf.Append(
            "endcmap\n" +
            "CMapName currentdict /CMap defineresource pop\n" +
            "end end\n");
        return buf.ToString();
    }
    /// <summary>
    /// Formats a code as a hex string token for use in a CMap.
    /// Values below 0x10000 become <c>&lt;xxxx&gt;</c>; larger values are split
    /// into a UTF-16 surrogate pair and written as <c>[&lt;hhhhllll&gt;]</c>.
    /// </summary>
    /// <param name="n">The code to format.</param>
    /// <returns>The lowercase hexadecimal token.</returns>
    internal static string ToHex(int n)
    {
        const int supplementaryBase = 0x10000;
        if (n >= supplementaryBase)
        {
            int offset = n - supplementaryBase;
            int highSurrogate = 0xd800 + (offset / 0x400);
            int lowSurrogate = 0xdc00 + (offset % 0x400);
            return "[<" + highSurrogate.ToString("x4") + lowSurrogate.ToString("x4") + ">]";
        }

        // "x4" yields lowercase hex, zero-padded to at least four digits.
        return "<" + n.ToString("x4") + ">";
    }

使用iTextSharp在PDF中生成unicode映射

0 个答案: