我正在尝试使用 pdftools 包的 pdf_text 从 pdf 导入原始文本数据,但文本中的 250 μm 被删除了特殊字符,得到的是 250 m。
我又尝试了 textreadr 包的 read_pdf,它给了我 <U+0095>m。
有没有什么办法可以获取原始数据,而不会弄乱单位?
我无法分享来源(pdf)。我的脚本:
library(pdftools)
library(tidyverse)
library(tidytext)
library(textreadr)
# Collect every PDF in the working directory. The dot is escaped so that only
# a literal ".pdf" extension matches (".pdf$" would also accept e.g. "xpdf").
all_pdfs <- list.files(pattern = "\\.pdf$")

# Build one row per element returned by pdf_text() for each file, tagged with
# the document's Title and Author metadata keys, then clean the text and split
# it into sentences.
dat_txt <- map_df(all_pdfs, function(pdf_path) {
  # Query the metadata once per file instead of once per column.
  pdf_keys <- pdf_info(pdf_path)$keys
  tibble(
    text = pdf_text(pdf_path),
    # A missing key is NULL; fall back to NA so the column is always present.
    titre = pdf_keys$Title %||% NA_character_,
    auteur = pdf_keys$Author %||% NA_character_
  )
}) %>%
  mutate(
    # Drop the period after "dr" (any listed casing is normalised to "dr");
    # presumably so the sentence tokenizer does not split on the abbreviation.
    text = str_replace_all(text, "dr\\.|DR\\.|Dr\\.", "dr"),
    # Same treatment for "e.g.".
    text = str_replace_all(text, "e\\.g\\.", "eg"),
    # Remove "Page N of M" markers; "+" lets N and M have several digits.
    text = str_replace_all(
      text,
      "Page\\s[[:digit:]]+\\sof\\s[[:digit:]]+",
      ""
    )
  ) %>%
  # Replace the text column with one row per sentence.
  unnest_tokens(text, text, token = "sentences")
编辑:元数据
[1] "<?xpacket begin=''
id='W5M0MpCehiHzreSzNTczkc9d'?>
<?adobe-xap-filters esc=\"CRLF\"?>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' xmlns:iX='http://ns.adobe.com/iX/1.0/'>\
<rdf:Description rdf:about='uuid:110d9367-d653-11e5-0000-9c55a6090709' xmlns:pdf='http://ns.adobe.com/pdf/1.3/' pdf:Producer='GPL Ghostscript 9.10'/>
<rdf:Description rdf:about='uuid:110d9367-d653-11e5-0000-9c55a6090709' xmlns:xmp='http://ns.adobe.com/xap/1.0/'><xmp:ModifyDate>2016-02-15T16:19:55+01:00</xmp:ModifyDate>
<xmp:CreateDate>2016-02-15T16:19:55+01:00</xmp:CreateDate>
<xmp:CreatorTool>FreePDF 4.09f - http://shbox.de</xmp:CreatorTool></rdf:Description><rdf:Description rdf:about='uuid:110d9367-d653-11e5-0000-9c55a6090709' xmlns:xapMM='http://ns.adobe.com/xap/1.0/mm/' xapMM:DocumentID='uuid:110d9367-d653-11e5-0000-9c55a6090709'/>
<rdf:Description rdf:about='uuid:110d9367-d653-11e5-0000-9c55a6090709' xmlns:dc='http://purl.org/dc/elements/1.1/' dc:format='application/pdf'><dc:title><rdf:Alt><rdf:li xml:lang='x-default'>confidentiel</rdf:li></rdf:Alt></dc:title><dc:creator><rdf:Seq><rdf:li>createur</rdf:li></rdf:Seq></dc:creator></rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end='w'?>"