在我的 Windows 8 应用程序中,我想逐行读取 PDF 的文本,并把结果存入一个 string 数组。我该怎么做?
/// <summary>Accumulates the extracted text of every page processed so far.</summary>
public StringBuilder addd = new StringBuilder();

/// <summary>
/// Extracted text of the most recently picked PDF, one element per page
/// (element 0 holds page 1). Stays null until a file has been processed.
/// </summary>
string[] array;

/// <summary>
/// Lets the user pick a PDF file, extracts the text of each page, appends it
/// to <see cref="addd"/> and stores the per-page text in <see cref="array"/>.
/// </summary>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data for the click.</param>
private async void btndosyasec_Click(object sender, RoutedEventArgs e)
{
    FileOpenPicker openPicker = new FileOpenPicker();
    openPicker.ViewMode = PickerViewMode.List;
    openPicker.SuggestedStartLocation = PickerLocationId.PicturesLibrary;
    openPicker.FileTypeFilter.Add(".pdf");

    StorageFile file = await openPicker.PickSingleFileAsync();
    if (file == null)
    {
        // No file was picked; nothing to extract.
        return;
    }

    // Keep the underlying stream alive for the whole extraction and release it afterwards.
    using (var input = await file.OpenReadAsync())
    {
        PdfReader reader = new PdfReader(input.AsStream());
        try
        {
            int pageCount = reader.NumberOfPages;
            string[] pages = new string[pageCount];

            // PDF page numbers are 1-based, array slots are 0-based.
            for (int page = 1; page <= pageCount; page++)
            {
                // Extract once and reuse the result for both destinations.
                string pageText = PdfTextExtractor.GetTextFromPage(reader, page);
                addd.Append(pageText);
                pages[page - 1] = pageText;
            }

            // Publish the result only after every page was read successfully.
            array = pages;
        }
        finally
        {
            // Close the reader exactly once, after the loop, even if extraction throws.
            reader.Close();
        }
    }
}
答案 0 :(得分:7)
您好,我也遇到过这个问题;我使用了下面这段代码,确实有效。
您需要添加对 iTextSharp 库的引用。
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
// Reads a PDF page by page and splits each page's text into lines.
PdfReader reader = new PdfReader(@"D:\test pdf\Blood Journal.pdf");
int intPageNum = reader.NumberOfPages;
string text;    // text of the page currently being processed
string[] words; // lines of the page currently being processed
string line;    // the line currently being processed
try
{
    // PDF page numbers are 1-based.
    for (int i = 1; i <= intPageNum; i++)
    {
        text = PdfTextExtractor.GetTextFromPage(reader, i, new LocationTextExtractionStrategy());
        words = text.Split('\n');
        for (int j = 0, len = words.Length; j < len; j++)
        {
            // Round-trip the line through UTF-8; consume `line` here as needed.
            line = Encoding.UTF8.GetString(Encoding.UTF8.GetBytes(words[j]));
        }
    }
}
finally
{
    // Release the reader even if extraction fails part-way through.
    reader.Close();
}
`words` 数组保存的是当前页按行拆分后的文本。
答案 1 :(得分:0)
如果您要寻找免许可费用的开源方案,并且只需要从 PDF 中提取基本文本,则可以使用 PdfClown。它同时支持 .NET Framework 和 .NET Core(不过面向 .NET Standard 2.0 的版本目前仍是测试版)。有关更多信息或示例,请查看:
https://www.nuget.org/packages/PdfClown.NetStandard/0.2.0-beta
https://sourceforge.net/p/clown/code/HEAD/tree/trunk/dotNET/pdfclown.samples.cli/
下面的示例针对 .NET Core。
/// <summary>
/// Demonstrates two ways of extracting text from a PDF with PdfClown:
/// a manual content-stream scan (approach 1) and <c>TextExtractor</c> (approach 2).
/// </summary>
public class PdfClownUtil
{
    private static readonly string fileSrcPath = "MyTestDoc.pdf";

    // Receives the text decoded by approach 1; it is not part of the value
    // returned by GetPdfTextContent.
    private readonly StringBuilder stringBuilder_1 = new StringBuilder();

    /// <summary>
    /// Opens the PDF at <c>fileSrcPath</c> and extracts the text of every page.
    /// </summary>
    /// <returns>
    /// The text collected by approach 2, with a line break appended after each page.
    /// </returns>
    public string GetPdfTextContent()
    {
        // Dispose the file when done so the underlying handle is released,
        // even if extraction throws.
        using (File file = new File(fileSrcPath))
        {
            PdfDocuments.Document document = file.Document;
            StringBuilder stringBuilder_2 = new StringBuilder();
            TextExtractor extractor = new TextExtractor();
            foreach (Page page in document.Pages)
            {
                // Approach-1:
                Extract(new ContentScanner(page));

                // Approach-2 with additional Options:
                IList<ITextString> textStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
                foreach (ITextString textString in textStrings)
                {
                    stringBuilder_2.Append(textString.Text);
                }
                stringBuilder_2.AppendLine();
            }

            var content = stringBuilder_2.ToString();
            return content;
        }
    }

    // Approach-1:
    /// <summary>
    /// Walks one content-stream level, appending each decoded text chunk to
    /// <c>stringBuilder_1</c> and recursing into nested levels.
    /// </summary>
    /// <param name="level">Scanner positioned on the level to walk; null is ignored.</param>
    private void Extract(ContentScanner level)
    {
        if (level == null)
        {
            return;
        }

        while (level.MoveNext())
        {
            ContentObject content = level.Current;
            if (content is ShowText)
            {
                Font font = level.State.Font;
                // Extract the current text chunk, decoding it!
                this.stringBuilder_1.Append(font.Decode(((ShowText)content).Text));
            }
            else if (content is Text || content is ContainerObject)
            {
                // Scan the inner level!
                Extract(level.ChildLevel);
            }
        }
    }
}
答案 2 :(得分:0)
以下代码适用于iText7
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
/// <summary>
/// Extracts the text of every page of a PDF and writes it to the console.
/// </summary>
/// <param name="filePath">Path of the PDF file to read.</param>
public void ExtractTextFromPDF(string filePath)
{
    // `using` closes the document and the reader even if extraction throws.
    using (PdfReader pdfReader = new PdfReader(filePath))
    using (PdfDocument pdfDoc = new PdfDocument(pdfReader))
    {
        int pageCount = pdfDoc.GetNumberOfPages();

        // PDF page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++)
        {
            // A fresh strategy per page, so each call gets its own listener instance.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string pageContent = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(page), strategy);
            Console.WriteLine("pageContent : " + pageContent);
        }
    }
}