我创建了一个脚本,用于在一个文件夹中查找 pdf 中的单词,然后如果找到它会将 pdf 移动到另一个文件夹。
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Question script: search every PDF in 'Folder 1' for a regex pattern and copy
# matching files to 'Folder 2'. Indentation is restored; the faulty path join
# is intentionally left in place because it is the subject of the question.
pattern = input("Enter string pattern to search: ")
# Raw string: the backslashes are literal, with no invalid escape sequences.
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'
for file_name in os.scandir(src):
    # file_name is an os.DirEntry, not a plain string.
    file = PyPDF2.PdfFileReader(str(src / file_name), 'rb')
    numPages = file.getNumPages()
    for i in range(0, numPages):
        pageObj = file.getPage(i)
        text = pageObj.extractText()
        for match in re.findall(pattern, text, re.IGNORECASE):
            # BUG (the one being asked about): joining a DirEntry onto a Path
            # uses the entry's full path, which replaces `dst`, so source and
            # destination are the same file and copyfile raises SameFileError.
            # Use `file_name.name` to get just the file name.
            shutil.copyfile(str(src / file_name), str(dst / file_name))
当我运行它时出现错误:
SameFileError: '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' and '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' are the same file
出于某种原因,dst 的路径被替换成了 src 的路径。为什么?以及如何修复它?
dst
Out[99]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 2')
file_name
Out[100]: <DirEntry 'Daily Production Summary 1.pdf'>
dst/file_name
Out[101]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 1/Daily Production Summary 1.pdf')
它将文件夹 2 更改为文件夹 1!
答案 0 :(得分:0)
我发现 file_name(一个 DirEntry 对象)在拼接路径时给出的是文件的完整路径,
因此在 src / file_name 和 dst / file_name 中,这个完整路径会直接取代 src 和 dst,
两个表达式得到的都是源文件夹中的同一个文件。
您只需要获得名称 file_name.name
src / file_name.name
dst / file_name.name
顺便说一句:
完整路径
print( file_name.path )
只有文件名
print( file_name.name )
顺便说一句:每找到一个匹配你都会复制一次同一个文件,但其实只需复制一次
使用变量 found
并在 for i
循环后复制
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Search every PDF in 'Folder 1' for a regex pattern (case-insensitive) and
# copy each matching file to 'Folder 2' once, after its pages were scanned.
pattern = input("Enter string pattern to search: ")
# Raw string: the backslashes are literal, with no invalid escape sequences.
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

for file_name in os.scandir(src):
    # Use DirEntry.name (bare file name); joining the DirEntry itself would
    # substitute its full path and discard `src` / `dst`.
    # No 'rb' argument: PdfFileReader's second positional parameter is
    # `strict`, not a file mode.
    file = PyPDF2.PdfFileReader(str(src / file_name.name))
    numPages = file.getNumPages()
    found = False

    for i in range(numPages):
        pageObj = file.getPage(i)
        text = pageObj.extractText()
        # Only the existence of a match matters, so search() is enough.
        if re.search(pattern, text, re.IGNORECASE):
            found = True
            break  # one hit is sufficient; skip the remaining pages

    # Copy after the page loop so each file is copied at most once.
    if found:
        shutil.copyfile(str(src / file_name.name), str(dst / file_name.name))
或者在第一次复制后使用 break 跳出 for i 循环
from pathlib import Path
import PyPDF2
import re
import os
import shutil
# Search every PDF in 'Folder 1' for a regex pattern (case-insensitive) and
# copy a file to 'Folder 2' as soon as the first matching page is found.
pattern = input("Enter string pattern to search: ")
# Raw string: the backslashes are literal, with no invalid escape sequences.
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')
src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

for file_name in os.scandir(src):
    # Use DirEntry.name (bare file name); joining the DirEntry itself would
    # substitute its full path and discard `src` / `dst`.
    # No 'rb' argument: PdfFileReader's second positional parameter is
    # `strict`, not a file mode.
    file = PyPDF2.PdfFileReader(str(src / file_name.name))
    numPages = file.getNumPages()

    for i in range(numPages):
        pageObj = file.getPage(i)
        text = pageObj.extractText()
        # Only the existence of a match matters, so search() is enough.
        if re.search(pattern, text, re.IGNORECASE):
            shutil.copyfile(str(src / file_name.name), str(dst / file_name.name))
            break  # there is no need to check rest of PDF