PDFNet:从Rect中提取文本时添加了空格

时间:2016-12-22 10:58:13

标签: c# pdftron

我需要使用 C#(PDFNet)处理 PDF 文件,

并从 PDF 的高亮标注(highlight)中提取文本。

我使用一个包含以下方法的 Helper 类,根据文本选择(text selection)创建高亮标注:

/// <summary>
/// Builds the appearance stream for a highlight annotation: a single filled,
/// unstroked rectangle drawn with the multiply blend mode, written into
/// <c>m_document</c> and tagged as a Form with the given bounding box.
/// </summary>
/// <param name="bbox">Rectangle the highlight covers; also stored as the stream's "BBox".</param>
/// <returns>The stream object returned by the element writer, with "BBox" and "Subtype" set.</returns>
private pdftron.SDF.Obj CreateHighlightAppearance(pdftron.PDF.Rect bbox)
{
    pdftron.SDF.Obj stm;

    // 'using' releases both helpers even if one of the calls below throws;
    // the original only disposed them on the success path.
    using (ElementBuilder build = new ElementBuilder())
    using (ElementWriter writer = new ElementWriter())
    {
        writer.Begin(m_document);

        // Draw background, padded by 2 units on the left and right.
        // NOTE(review): the last two CreateRect arguments may be width/height rather
        // than a second corner - confirm against the PDFNet API before relying on it.
        Element element = build.CreateRect(bbox.x1 - 2, bbox.y1, bbox.x2 + 2, bbox.y2);
        element.SetPathFill(true);
        element.SetPathStroke(false);

        GState gs = element.GetGState();
        gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
        gs.SetBlendMode(GState.BlendMode.e_bl_multiply);

        writer.WriteElement(element);
        stm = writer.End();
    }

    // Set the bounding box
    stm.PutRect("BBox", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
    stm.PutName("Subtype", "Form");
    return stm;
}

/// <summary>
/// Creates a highlight annotation in <c>m_document</c> for the given rectangle,
/// attaches its appearance stream, and writes a single quad to "QuadPoints".
/// </summary>
/// <param name="rect">Rectangle used for the annotation, its appearance and its quad.</param>
/// <returns>The newly created annotation.</returns>
public Annot CreateHighlightAnnot(pdftron.PDF.Rect rect)
{
    Annot highlight = Annot.Create(m_document, Annot.Type.e_Highlight, rect);

    pdftron.SDF.Obj appearance = CreateHighlightAppearance(rect);
    highlight.SetAppearance(appearance);

    // The quad as four (x, y) pairs, pushed in exactly this order:
    // (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    double[] coordinates =
    {
        rect.x1, rect.y2,
        rect.x2, rect.y2,
        rect.x1, rect.y1,
        rect.x2, rect.y1
    };

    pdftron.SDF.Obj quadPoints = highlight.GetSDFObj().PutArray("QuadPoints");
    foreach (double coordinate in coordinates)
    {
        quadPoints.PushBackNumber(coordinate);
    }

    return highlight;
}

/// <summary>
/// Turns the viewer's current selection into highlight annotations, one per
/// selection quad, then marks the document as modified and clears the selection.
/// Does nothing when there is no document, no selection, or the quad array is
/// empty or not a whole number of quads.
/// </summary>
public void AddHighlights()
{
    if (m_document == null)
    {
        return;
    }

    PDFViewCtrl.Selection selection = m_pdfViewer.GetSelection();
    if (selection == null)
    {
        return;
    }

    // Each quad is 8 doubles (four x/y pairs). An empty array used to pass the
    // "% 8 == 0" check and flag the document as modified without adding anything.
    double[] quads = selection.GetQuads();
    if (quads == null || quads.Length == 0 || quads.Length % 8 != 0)
    {
        return;
    }

    int pageNumber = selection.GetPageNum();
    int numQuads = quads.Length / 8;

    for (int i = 0; i < numQuads; i++)
    {
        // GetSelectionRect is defined elsewhere; presumably it converts quad i into
        // its bounding rectangle - verify against its definition.
        Rect selectionRect = GetSelectionRect(ref quads, i);
        Annot highlightAnnot = CreateHighlightAnnot(selectionRect);

        //remove any underlying highlight, to work with different colors
        m_pdfViewer.RemoveHighlightAnnotationFromPage(highlightAnnot.GetRect(), pageNumber);
        m_pdfViewer.AddHighlightAnnotationToPage(highlightAnnot, true);
    }

    m_pdfViewer.SetDocumentModified();
    m_pdfViewer.ClearSelection();
}

这是我用来提取文本的方法。问题在于 contentStr 的末尾总是会多出一个空格,这很烦人,因为该空白并不总是在高亮范围之内。

/// <summary>
/// Extracts the text lying under an annotation, with extraction limited to a
/// rectangle of the given page.
/// </summary>
/// <param name="rect">Rectangle passed to the extractor together with the page.</param>
/// <param name="page">Page to extract from.</param>
/// <param name="annot">Annotation whose underlying text is returned.</param>
/// <returns>The extracted text; an empty string when the extractor yields null.</returns>
private string GetTextFromRect(Rect rect, pdftron.PDF.Page page, Annot annot)
{
    // 'using' releases the extractor on every path; the original never disposed it.
    using (TextExtractor txtExtractor = new TextExtractor())
    {
        txtExtractor.Begin(page, rect);

        // NOTE(review): the result is reported to end with an extra space that is not
        // always part of the highlight. It is returned untouched here because the cause
        // is unknown; callers needing exact text may have to trim it.
        return txtExtractor.GetTextUnderAnnot(annot) ?? "";
    }
}

0 个答案:

没有答案