我正在尝试为 PDF 中的字体生成 Unicode 映射(ToUnicode CMap)。在这个 PDF 中,某个特定字体缺少 Unicode 映射。我需要从页面中提取文本,但没有这个映射,提取出来的就是乱码。我参考 iTextSharp 源代码中的代码编写了下面这段代码。然而,pageText1 和 pageText2 似乎都忽略了我定义的映射表(metrics)。
/// <summary>
/// Program entry point: extracts the text of page 1 of "test.pdf".
/// The extracted text is only held in a local (e.g. for inspection in a debugger).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int firstPage = 1;
    string extractedText = ReadFilePart("test.pdf", firstPage);
}
/// <summary>
/// Opens the given PDF, patches its fonts with <c>fontFix</c>, and returns the
/// text extracted from a single page.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <param name="pageNumber">1-based page number to extract text from.</param>
/// <returns>The text extracted from the page after the font fix was applied.</returns>
private static string ReadFilePart(string fileName, int pageNumber)
{
    var pdfReader = new PdfReader(fileName);
    try
    {
        // The original returned an undeclared identifier (pageText); the
        // post-fix extraction result is what the caller is after.
        PdfReader patchedReader = fontFix(pdfReader);
        return PdfTextExtractor.GetTextFromPage(patchedReader, pageNumber, new SimpleTextExtractionStrategy());
    }
    finally
    {
        // Always release the underlying file, even when extraction throws.
        pdfReader.Close();
    }
}
/// <summary>
/// Installs a generated /ToUnicode CMap on every font referenced from the
/// page resources of <paramref name="reader"/>, so that text extraction can
/// translate character codes to Unicode.
/// </summary>
/// <param name="reader">An open reader whose in-memory font dictionaries are modified.</param>
/// <returns>The same <paramref name="reader"/> instance that was passed in.</returns>
static PdfReader fontFix(PdfReader reader)
{
    // Each value is a triple; only element [0] (source character code) and
    // element [2] (target Unicode code point) are consumed when the CMap is built.
    // NOTE(review): the dictionary keys (1..44) are discarded below, so the
    // resulting map is 32->32, 33->33, ... If the intent was key -> Unicode
    // (1->32, 2->33, ...), element [0] of each triple must hold the key instead.
    Dictionary<int, int[]> longTag = new Dictionary<int, int[]>
    {
        {1, new[] {32,32,32}},
        {2, new[] {33,33,33}},
        {3, new[] {34,34,34}},
        {4, new[] {35,35,35}},
        {5, new[] {36,36,36}},
        {6, new[] {37,37,37}},
        {7, new[] {38,38,38}},
        {8, new[] {39,39,39}},
        {9, new[] {40,40,40}},
        {10, new[] {41,41,41}},
        {11, new[] {42,42,42}},
        {12, new[] {43,43,43}},
        {13, new[] {44,44,44}}, // was {44,4,44}: typo in the (unused) middle element
        {14, new[] {45,45,45}},
        {15, new[] {46,46,46}},
        {16, new[] {47,47,47}},
        {17, new[] {48,48,48}},
        {18, new[] {49,49,49}},
        {19, new[] {50,50,50}},
        {20, new[] {51,51,51}},
        {21, new[] {52,52,52}},
        {22, new[] {53,53,53}},
        {23, new[] {54,54,54}},
        {24, new[] {55,55,55}},
        {25, new[] {56,56,56}},
        {26, new[] {57,57,57}},
        {27, new[] {58,58,58}},
        {28, new[] {59,59,59}},
        {29, new[] {60,60,60}},
        {30, new[] {61,61,61}},
        {31, new[] {62,62,62}},
        {32, new[] {63,63,63}},
        {33, new[] {64,64,64}},
        {34, new[] {65,65,65}},
        {35, new[] {66,66,66}},
        {36, new[] {67,67,67}},
        {37, new[] {68,68,68}},
        {38, new[] {69,69,69}},
        {39, new[] {70,70,70}},
        {40, new[] {71,71,71}},
        {41, new[] {72,72,72}},
        {42, new[] {73,73,73}},
        {43, new[] {74,74,74}},
        {44, new[] {75,75,75}},
    };
    int[][] metrics = new int[longTag.Count][];
    longTag.Values.CopyTo(metrics, 0);

    // The CMap does not depend on the page or font, so build it once.
    byte[] cmapBytes = BuildToUnicodeCMapBytes(metrics);
    if (cmapBytes == null)
        return reader;

    // The map is stored as a PRStream bound to this reader rather than a plain
    // PdfStream. NOTE(review): iTextSharp's text-extraction font classes appear
    // to parse /ToUnicode only when the entry is a PRStream, which is why a
    // PdfStream placed here was silently ignored - confirm against the
    // iTextSharp version in use.
    PRStream toUnicode = new PRStream(reader, cmapBytes);

    int documentPages = reader.NumberOfPages;
    for (int page = 1; page <= documentPages; page++)
    {
        PdfDictionary pageResources = reader.GetPageResources(page);
        if (pageResources == null)
            continue;

        PdfDictionary pageFonts = pageResources.GetAsDict(PdfName.FONT);
        if (pageFonts == null || pageFonts.Size == 0)
            continue;

        foreach (PdfName key in pageFonts.Keys)
        {
            // 'as' instead of hard casts: an unexpected reference or font type
            // is skipped rather than raising InvalidCastException.
            PRIndirectReference fontReference = pageFonts.GetAsIndirectObject(key) as PRIndirectReference;
            if (fontReference == null)
                continue;

            DocumentFont font = BaseFont.CreateFont(fontReference) as DocumentFont;
            if (font == null)
                continue;

            // NOTE(review): this replaces any ToUnicode map the font already
            // has, including valid ones - restrict to the affected font if
            // only one font is actually missing its map.
            font.FontDictionary.Put(PdfName.TOUNICODE, toUnicode);
        }
    }
    return reader;
}

/// <summary>
/// Builds a compressed ToUnicode CMap stream from the given metrics.
/// </summary>
/// <param name="metrics">
/// Array whose elements are <c>int[]</c> triples; element [0] is the source
/// character code and element [2] is the Unicode code point it maps to.
/// </param>
/// <returns>
/// A Flate-compressed <see cref="PdfStream"/> holding the CMap, or
/// <c>null</c> when <paramref name="metrics"/> is null or empty.
/// </returns>
public static PdfStream GetToUnicode(Object[] metrics)
{
    byte[] cmapBytes = BuildToUnicodeCMapBytes(metrics);
    if (cmapBytes == null)
        return null;

    PdfStream stream = new PdfStream(cmapBytes);
    stream.FlateCompress(-1);
    return stream;
}

/// <summary>
/// Renders the ToUnicode CMap program text for the given metrics and converts
/// it to bytes. Returns <c>null</c> when there is nothing to map.
/// </summary>
private static byte[] BuildToUnicodeCMapBytes(Object[] metrics)
{
    if (metrics == null || metrics.Length == 0)
        return null;

    // Entries are emitted in bfrange sections of at most this many lines each.
    const int maxEntriesPerSection = 100;

    StringBuilder buf = new StringBuilder(
        "/CIDInit /ProcSet findresource begin\n" +
        "12 dict begin\n" +
        "begincmap\n" +
        "/CIDSystemInfo\n" +
        "<< /Registry (TTX+0)\n" +
        "/Ordering (T42UV)\n" +
        "/Supplement 0\n" +
        ">> def\n" +
        "/CMapName /TTX+0 def\n" +
        "/CMapType 2 def\n" +
        "1 begincodespacerange\n" +
        "<0000><FFFF>\n" +
        "endcodespacerange\n");

    for (int sectionStart = 0; sectionStart < metrics.Length; sectionStart += maxEntriesPerSection)
    {
        int sectionSize = Math.Min(maxEntriesPerSection, metrics.Length - sectionStart);
        buf.Append(sectionSize).Append(" beginbfrange\n");

        for (int k = sectionStart; k < sectionStart + sectionSize; ++k)
        {
            int[] metric = (int[])metrics[k];
            // A single-code range: <code><code><unicode>.
            string fromTo = ToHex(metric[0]);
            buf.Append(fromTo).Append(fromTo).Append(ToHex(metric[2])).Append('\n');
        }

        buf.Append("endbfrange\n");
    }

    buf.Append(
        "endcmap\n" +
        "CMapName currentdict /CMap defineresource pop\n" +
        "end end\n");

    return PdfEncodings.ConvertToBytes(buf.ToString(), null);
}
/// <summary>
/// Formats a code point as a hexadecimal CMap token.
/// </summary>
/// <param name="n">The value to format.</param>
/// <returns>
/// For values below 0x10000, <c>&lt;xxxx&gt;</c> with lowercase hex digits
/// left-padded with zeros to at least four characters. For values of 0x10000
/// and above, the UTF-16 surrogate pair written as <c>[&lt;hhhhllll&gt;]</c>.
/// </returns>
internal static string ToHex(int n)
{
    if (n >= 0x10000)
    {
        // Split the supplementary-plane offset into high and low surrogates.
        int offset = n - 0x10000;
        string highSurrogate = System.Convert.ToString(0xd800 + (offset / 0x400), 16).PadLeft(4, '0');
        string lowSurrogate = System.Convert.ToString(0xdc00 + (offset % 0x400), 16).PadLeft(4, '0');
        return "[<" + highSurrogate + lowSurrogate + ">]";
    }

    string digits = System.Convert.ToString(n, 16).PadLeft(4, '0');
    return "<" + digits + ">";
}