Question

我使用 PDFBox 实现了从任意 PDF 中提取单词的功能，它按行逐个提取单词。以下是我为此编写的类：

    /// <summary>
    /// Text stripper that collects the glyphs handed to <see cref="processTextPosition"/>
    /// into <c>PdfWord</c> objects. Words are indexed first by the glyph's direction-adjusted
    /// Y coordinate and then by the word's <c>Right</c> value, so that the next glyph on the
    /// same row can be matched to the word it continues.
    /// </summary>
    class PDFTextLocationStripper : PDFTextStripper
    {
        // Largest distance allowed between an indexed word key and the X coordinate of the
        // incoming glyph for that glyph to be appended to the word instead of starting a new one.
        private const float MaxJoinGap = 1f;

        // Kept for callers; this class only ever resets it to the empty string.
        public string textWithPostion = "";

        // Outer key: glyph Y (getYDirAdj). Inner key: PdfWord.Right at the time of insertion.
        public Dictionary<float, Dictionary<float, PdfWord>> pdfWordsByXByY;

        /// <summary>
        /// Creates a stripper with an empty word index.
        /// </summary>
        public PDFTextLocationStripper(): base()
        {
            textWithPostion = "";
            pdfWordsByXByY = new Dictionary<float, Dictionary<float, PdfWord>>();
        }

        /// <summary>
        /// Adds one glyph to the word index: it is appended to the word on the same row whose
        /// key equals the glyph's X (or is the closest key within <see cref="MaxJoinGap"/>),
        /// otherwise it starts a new word. Glyphs whose text is null or whitespace are ignored,
        /// which is what separates words.
        /// </summary>
        /// <param name="text">The glyph position reported by the stripper.</param>
        protected override void processTextPosition(TextPosition text)
        {
            try
            {
                string glyph = text.getUnicode();
                if (String.IsNullOrWhiteSpace(glyph))
                {
                    return;
                }

                float textX = text.getXDirAdj();
                float textY = text.getYDirAdj();

                // Rows are matched on exact float equality of Y, as in the original lookup.
                Dictionary<float, PdfWord> wordsByX;
                if (!pdfWordsByXByY.TryGetValue(textY, out wordsByX))
                {
                    wordsByX = new Dictionary<float, PdfWord>();
                    pdfWordsByXByY.Add(textY, wordsByX);
                }

                float wordKey;
                if (TryFindWordKey(wordsByX, textX, out wordKey))
                {
                    AppendGlyph(wordsByX, wordKey, text, glyph, textX);
                }
                else
                {
                    StartWord(wordsByX, text, glyph, textX, textY);
                }
            }
            catch (Exception ex)
            {
                // Extraction stays best-effort (a bad glyph must not abort the whole document),
                // but the failure is recorded instead of being discarded silently.
                System.Diagnostics.Trace.TraceError("PDFTextLocationStripper.processTextPosition failed: " + ex);
            }
        }

        // Finds the key of the word the glyph at textX continues: an exact key match first,
        // otherwise the closest key if it lies within MaxJoinGap.
        private static bool TryFindWordKey(Dictionary<float, PdfWord> wordsByX, float textX, out float wordKey)
        {
            if (wordsByX.ContainsKey(textX))
            {
                wordKey = textX;
                return true;
            }

            wordKey = 0f;
            float minDiff = float.MaxValue;
            // Single pass over the keys; on ties the first key enumerated wins.
            foreach (float key in wordsByX.Keys)
            {
                float diff = Math.Abs(key - textX);
                if (diff < minDiff)
                {
                    minDiff = diff;
                    wordKey = key;
                }
            }

            // minDiff is still float.MaxValue when the row has no keys, so no separate
            // "found" sentinel is needed and keys at negative coordinates match as well.
            return minDiff <= MaxJoinGap;
        }

        // Extends the word stored under wordKey with the glyph and re-indexes it under its new Right value.
        private static void AppendGlyph(Dictionary<float, PdfWord> wordsByX, float wordKey, TextPosition text, string glyph, float textX)
        {
            PdfWord word = wordsByX[wordKey];
            // Remove under the key the word was found by; its Right value changes below.
            wordsByX.Remove(wordKey);

            word.EndCharWidth = text.getWidthDirAdj();
            float glyphHeight = text.getHeightDir();
            if (glyphHeight > word.Height)
            {
                word.Height = glyphHeight;
            }
            word.EndX = textX;
            word.Text += glyph;

            // NOTE(review): if another word already occupies the new Right key, the extended
            // word is not re-inserted and drops out of the index. This matches the original
            // behavior; confirm whether overwriting or merging is wanted instead.
            if (!wordsByX.ContainsKey(word.Right))
            {
                wordsByX.Add(word.Right, word);
            }
        }

        // Creates a one-glyph word and indexes it under its Right value unless that key is already taken.
        private static void StartWord(Dictionary<float, PdfWord> wordsByX, TextPosition text, string glyph, float textX, float textY)
        {
            PdfWord word = new PdfWord();
            word.Text = glyph;
            word.EndX = word.StartX = textX;
            word.Y = textY;
            word.EndCharWidth = word.StartCharWidth = text.getWidthDirAdj();
            word.Height = text.getHeightDir();

            if (!wordsByX.ContainsKey(word.Right))
            {
                wordsByX.Add(word.Right, word);
            }
        }
    }

这里是调用此类的代码：

        /// <summary>
        /// Runs a <c>PDFTextLocationStripper</c> with position sorting enabled over the
        /// document and returns the word index it built.
        /// </summary>
        /// <param name="doc">The document to extract words from.</param>
        /// <returns>
        /// The stripper's <c>pdfWordsByXByY</c> dictionary, or <c>null</c> if any exception
        /// was thrown during extraction.
        /// </returns>
        private Dictionary<float, Dictionary<float, PdfWord>> ExtractTextWithLocation(PDDocument doc)
        {
            try
            {
                PDFTextLocationStripper textStripper = new PDFTextLocationStripper();
                textStripper.setSortByPosition(true);
                // The text returned by getText is discarded; only the word index is used.
                textStripper.getText(doc);
                return textStripper.pdfWordsByXByY;
            }
            catch (Exception ex)
            {
                // Callers still get null on failure, but the cause is now recorded.
                System.Diagnostics.Trace.TraceError("ExtractTextWithLocation failed: " + ex);
                return null;
            }
        }

这段代码可以提取水平排列的单词，但该如何检测并提取垂直或倾斜排列的单词？

如何使用pdfbox从pdf文件中提取垂直/倾斜文本以及边界？

0 个答案: