我正在尝试使用iTextSharp(v5.5.12.1)从以下PDF中提取文本: https://structure.mil.ru/files/morf/military/files/ENGV_1929.pdf
/// <summary>
/// Extracts the text of every page of a PDF document and returns it as one string.
/// </summary>
/// <param name="pdfStream">Stream positioned at the start of the PDF document.</param>
/// <param name="addNewLineBetweenPages">
/// When <c>true</c>, <see cref="Environment.NewLine"/> is inserted between the text of
/// consecutive pages (but not after the last page).
/// </param>
/// <returns>The concatenated text of all pages, in page order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is <c>null</c>.</exception>
public static string ExtractTextFromPdf(Stream pdfStream, bool addNewLineBetweenPages = false)
{
    if (pdfStream == null)
    {
        throw new ArgumentNullException(nameof(pdfStream));
    }

    using (PdfReader reader = new PdfReader(pdfStream))
    {
        int pageCount = reader.NumberOfPages;

        // A StringBuilder avoids re-copying the accumulated text for every page,
        // which repeated string concatenation would do.
        System.Text.StringBuilder text = new System.Text.StringBuilder();

        // Page numbers passed to GetTextFromPage are 1-based.
        for (int i = 1; i <= pageCount; i++)
        {
            text.Append(PdfTextExtractor.GetTextFromPage(reader, i));

            if (addNewLineBetweenPages && i != pageCount)
            {
                text.Append(Environment.NewLine);
            }
        }

        return text.ToString();
    }
}
答案 0 :(得分:1)
这里的问题是:嵌入式字体程序中的字形使用了非标准的字形名称(G00、G01、……),并且只能通过这些名称来识别。因此,必须建立从这些字形名称到 Unicode 字符的映射。可以通过检查 PDF 中的字体程序(例如使用 FontForge),并通过目视将各个字形与其名称一一对应起来来完成这项工作。例如,像这样:
然后,您必须将这些映射注入到 iText 中。由于这些映射是隐藏的(GlyphList 类的 private static 成员),需要通过反射来访问:
/// <summary>
/// Registers additional glyph-name-to-Unicode mappings (names of the form "Gxx") in the
/// non-public static <c>names2unicode</c> dictionary of <c>GlyphList</c>, using reflection.
/// </summary>
/// <remarks>
/// Existing entries with the same glyph name are overwritten. The change affects the
/// shared static dictionary, so it applies to the whole process, not just one document.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// The <c>names2unicode</c> field cannot be found or does not hold a
/// <c>Dictionary&lt;string, int[]&gt;</c> (for example, in a different library version).
/// </exception>
void InitializeGlyphs()
{
    // The field is static and non-public, so only these two binding flags are required.
    FieldInfo names2unicodeField = typeof(GlyphList).GetField("names2unicode", BindingFlags.NonPublic | BindingFlags.Static);
    if (names2unicodeField == null)
    {
        throw new InvalidOperationException("Field 'names2unicode' was not found on GlyphList; the library version may differ from the one this code was written for.");
    }

    Dictionary<string, int[]> names2unicode = names2unicodeField.GetValue(null) as Dictionary<string, int[]>;
    if (names2unicode == null)
    {
        throw new InvalidOperationException("Field 'names2unicode' on GlyphList is not an initialized Dictionary<string, int[]>.");
    }

    // Each char literal is implicitly converted to its int code point.

    // Space, punctuation and digits.
    names2unicode["G03"] = new int[] { ' ' };
    names2unicode["G0A"] = new int[] { '\'' };
    names2unicode["G0B"] = new int[] { '(' };
    names2unicode["G0C"] = new int[] { ')' };
    names2unicode["G0F"] = new int[] { ',' };
    names2unicode["G10"] = new int[] { '-' };
    names2unicode["G11"] = new int[] { '.' };
    names2unicode["G12"] = new int[] { '/' };
    names2unicode["G13"] = new int[] { '0' };
    names2unicode["G14"] = new int[] { '1' };
    names2unicode["G15"] = new int[] { '2' };
    names2unicode["G16"] = new int[] { '3' };
    names2unicode["G17"] = new int[] { '4' };
    names2unicode["G18"] = new int[] { '5' };
    names2unicode["G19"] = new int[] { '6' };
    names2unicode["G1A"] = new int[] { '7' };
    names2unicode["G1B"] = new int[] { '8' };
    names2unicode["G1C"] = new int[] { '9' };
    names2unicode["G1D"] = new int[] { ':' };
    names2unicode["G23"] = new int[] { '@' };

    // Uppercase letters A-Z.
    names2unicode["G24"] = new int[] { 'A' };
    names2unicode["G25"] = new int[] { 'B' };
    names2unicode["G26"] = new int[] { 'C' };
    names2unicode["G27"] = new int[] { 'D' };
    names2unicode["G28"] = new int[] { 'E' };
    names2unicode["G29"] = new int[] { 'F' };
    names2unicode["G2A"] = new int[] { 'G' };
    names2unicode["G2B"] = new int[] { 'H' };
    names2unicode["G2C"] = new int[] { 'I' };
    names2unicode["G2D"] = new int[] { 'J' };
    names2unicode["G2E"] = new int[] { 'K' };
    names2unicode["G2F"] = new int[] { 'L' };
    names2unicode["G30"] = new int[] { 'M' };
    names2unicode["G31"] = new int[] { 'N' };
    names2unicode["G32"] = new int[] { 'O' };
    names2unicode["G33"] = new int[] { 'P' };
    names2unicode["G34"] = new int[] { 'Q' };
    names2unicode["G35"] = new int[] { 'R' };
    names2unicode["G36"] = new int[] { 'S' };
    names2unicode["G37"] = new int[] { 'T' };
    names2unicode["G38"] = new int[] { 'U' };
    names2unicode["G39"] = new int[] { 'V' };
    names2unicode["G3A"] = new int[] { 'W' };
    names2unicode["G3B"] = new int[] { 'X' };
    names2unicode["G3C"] = new int[] { 'Y' };
    names2unicode["G3D"] = new int[] { 'Z' };
    names2unicode["G42"] = new int[] { '_' };

    // Lowercase letters a-z; "G46._" is mapped to the same character as "G46".
    names2unicode["G44"] = new int[] { 'a' };
    names2unicode["G45"] = new int[] { 'b' };
    names2unicode["G46"] = new int[] { 'c' };
    names2unicode["G46._"] = new int[] { 'c' };
    names2unicode["G47"] = new int[] { 'd' };
    names2unicode["G48"] = new int[] { 'e' };
    names2unicode["G49"] = new int[] { 'f' };
    names2unicode["G4A"] = new int[] { 'g' };
    names2unicode["G4B"] = new int[] { 'h' };
    names2unicode["G4C"] = new int[] { 'i' };
    names2unicode["G4D"] = new int[] { 'j' };
    names2unicode["G4E"] = new int[] { 'k' };
    names2unicode["G4F"] = new int[] { 'l' };
    names2unicode["G50"] = new int[] { 'm' };
    names2unicode["G51"] = new int[] { 'n' };
    names2unicode["G52"] = new int[] { 'o' };
    names2unicode["G53"] = new int[] { 'p' };
    names2unicode["G54"] = new int[] { 'q' };
    names2unicode["G55"] = new int[] { 'r' };
    names2unicode["G56"] = new int[] { 's' };
    names2unicode["G57"] = new int[] { 't' };
    names2unicode["G58"] = new int[] { 'u' };
    names2unicode["G59"] = new int[] { 'v' };
    names2unicode["G5A"] = new int[] { 'w' };
    names2unicode["G5B"] = new int[] { 'x' };
    names2unicode["G5C"] = new int[] { 'y' };
    names2unicode["G5D"] = new int[] { 'z' };

    // Cyrillic letters and typographic symbols (quotes, dash, numero sign).
    names2unicode["G62"] = new int[] { 'Ш' };
    names2unicode["G63"] = new int[] { 'Р' };
    names2unicode["G6A"] = new int[] { 'И' };
    names2unicode["G6B"] = new int[] { 'А' };
    names2unicode["G6C"] = new int[] { 'М' };
    names2unicode["G6D"] = new int[] { 'в' };
    names2unicode["G6E"] = new int[] { 'Ф' };
    names2unicode["G70"] = new int[] { 'Е' };
    names2unicode["G72"] = new int[] { 'Б' };
    names2unicode["G73"] = new int[] { 'Н' };
    names2unicode["G76"] = new int[] { 'С' };
    names2unicode["G7A"] = new int[] { 'К' };
    names2unicode["G7B"] = new int[] { 'В' };
    names2unicode["G7C"] = new int[] { 'О' };
    names2unicode["G7D"] = new int[] { 'к' };
    names2unicode["G7E"] = new int[] { 'З' };
    names2unicode["G80"] = new int[] { 'Г' };
    names2unicode["G81"] = new int[] { 'П' };
    names2unicode["G82"] = new int[] { 'у' };
    names2unicode["G85"] = new int[] { '»' };
    names2unicode["G88"] = new int[] { 'т' };
    names2unicode["G8D"] = new int[] { '’' };
    names2unicode["G90"] = new int[] { 'У' };
    names2unicode["G91"] = new int[] { 'Т' };
    names2unicode["GA1"] = new int[] { 'Ц' };
    names2unicode["GA2"] = new int[] { '№' };
    names2unicode["GAA"] = new int[] { 'э' };
    names2unicode["GAB"] = new int[] { 'я' };
    names2unicode["GAC"] = new int[] { 'і' };
    names2unicode["GAD"] = new int[] { 'б' };
    names2unicode["GAE"] = new int[] { 'й' };
    names2unicode["GAF"] = new int[] { 'р' };
    names2unicode["GB0"] = new int[] { 'с' };
    names2unicode["GB2"] = new int[] { 'х' };
    names2unicode["GB5"] = new int[] { '“' };
    names2unicode["GB9"] = new int[] { 'п' };
    names2unicode["GBA"] = new int[] { 'о' };
    names2unicode["GBD"] = new int[] { '«' };
    names2unicode["GC1"] = new int[] { 'ф' };
    names2unicode["GC8"] = new int[] { 'а' };
    names2unicode["GCB"] = new int[] { 'е' };
    names2unicode["GCE"] = new int[] { 'ж' };
    names2unicode["GCF"] = new int[] { 'з' };
    names2unicode["GD2"] = new int[] { 'и' };
    names2unicode["GD3"] = new int[] { 'н' };
    names2unicode["GDC"] = new int[] { '–' };
    names2unicode["GE3"] = new int[] { 'л' };
}
// Reads ENGV_1929.pdf, extracts its text with a line break between pages,
// and writes the result to ENGV_1929.txt.
// NOTE(review): the custom glyph mappings presumably have to be registered before
// this runs for the extraction to produce readable text — confirm at the call site.
// File.OpenRead requests read-only access, so the PDF can also be a read-only file.
using (FileStream pdfStream = File.OpenRead(@"ENGV_1929.pdf"))
{
    string result = ExtractTextFromPdf(pdfStream, true);
    File.WriteAllText(@"ENGV_1929.txt", result);
}
From Notices to Mariners
Edition No 29/2019
(English version)
Notiсes to Mariners from Seсtion II «Сharts Сorreсtion», based on the original sourсe information, and
NAVAREA XIII, XX and XXI navigational warnings are reprinted hereunder in English. Original Notiсes to
Mariners from Seсtion I «Misсellaneous Navigational Information» and from Seсtion III «Nautiсal
Publiсations Сorreсtion» may be only briefly annotated and/or a referenсe may be made to Notiсes from
other Seсtions. Information from Seсtion IV «Сatalogues of Сharts and Nautiсal Publiсations Сorreсtion»
сonсerning the issue of сharts and publiсations is presented with details.
Digital analogue of English version of the extracts from original Russian Notices to Mariners is available
by: http://structure.mil.ru/structure/forces/hydrographic/info/notices.htm
Вarents Sea
3493 Сharts 18012, 17052, 15005, 15004
Amend 1. Light to light Fl G 4s 1M at
front leading lightbeacon 69111’32.2“N 33129’48.0“E
2. Light to light Fl G 4s 1M at
rear leading lightbeacon 69111’34.85“N 33129’44.25“E
Cancel coastal warning