Question

我创建了一个脚本，用于在一个文件夹中查找 pdf 中的单词，然后如果找到它会将 pdf 移动到另一个文件夹。

from pathlib import Path
import PyPDF2
import re
import os
import shutil

# Ask the user for the regular expression to look for inside each PDF.
pattern = input("Enter string pattern to search: ")

# Base directory that contains both the source and the destination folder.
basepath = Path('\hrdinhal\Data\Desktop\Analize\Search engine')

src = basepath / 'Folder 1'  # folder that is scanned for PDFs
dst = basepath / 'Folder 2'  # folder that matching PDFs should be copied to


# NOTE: os.scandir() yields os.DirEntry objects, so file_name is a DirEntry,
# not a plain file-name string.
for file_name in os.scandir(src):
    file = PyPDF2.PdfFileReader(str(src / file_name), 'rb')
    numPages = file.getNumPages()

    # Walk every page and search its extracted text for the pattern.
    for i in range(0, numPages):
        pageObj = file.getPage(i)
        text = pageObj.extractText()
        
        # One copy is attempted per match (case-insensitive search).
        for match in re.findall(pattern, text, re.IGNORECASE):
            # This is the call that raises SameFileError in the report below.
            shutil.copyfile(str(src / file_name), str(dst / file_name))

当我运行它时出现错误：

SameFileError: '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' and '\\hrdinhal\\Data\\Desktop\\Analize\\Search engine\\Folder 1\\Daily Production Summary 1.pdf' are the same file

出于某种原因，它拿到 dst 之后却把它替换成了 src。为什么？该如何修复？

dst
Out[99]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 2')

file_name
Out[100]: <DirEntry 'Daily Production Summary 1.pdf'>

dst/file_name
Out[101]: WindowsPath('/hrdinhal/Data/Desktop/Analize/Search engine/Folder 1/Daily Production Summary 1.pdf')

它把 Folder 2 变成了 Folder 1！

Answer 1

我发现 file_name 保存的是文件的完整路径，在下面两个表达式中，这个完整路径会替换掉 src 和 dst：

src / file_name 
dst / file_name

您只需要获得名称 file_name.name

src / file_name.name
dst / file_name.name

顺便说一句：

完整路径

print( file_name.path )

只有文件名

print( file_name.name )

顺便说一句：每次匹配成功后你都会复制同一个文件，但其实只需要复制一次

使用变量 found 并在 for i 循环后复制

from pathlib import Path
import PyPDF2
import re
import os
import shutil

# Search every PDF in `src` for a user-supplied regular expression and copy
# each PDF that contains at least one match into `dst`.

pattern = input("Enter string pattern to search: ")

# Compile once, outside the loops; IGNORECASE makes the search case-insensitive.
regex = re.compile(pattern, re.IGNORECASE)

# Raw string: '\h', '\D', '\A' and '\S' are not valid escape sequences, so the
# non-raw literal only worked by accident and triggers a warning on newer
# Python versions. The resulting path value is unchanged.
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')

src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

# os.scandir() yields os.DirEntry objects; the context manager closes the
# underlying directory iterator when the loop is done.
with os.scandir(src) as entries:
    for file_name in entries:

        # Skip sub-directories and anything that is not a PDF, which the
        # PDF reader would otherwise fail on.
        if not file_name.is_file() or not file_name.name.lower().endswith('.pdf'):
            continue

        # Use .name (bare file name). Joining the DirEntry itself would use
        # its full path, which replaces src/dst and makes both paths equal.
        src_file = src / file_name.name
        dst_file = dst / file_name.name

        found = False

        # Open the file in binary mode ourselves: the second positional
        # argument of PdfFileReader is `strict`, not a file mode.
        with open(src_file, 'rb') as stream:
            reader = PyPDF2.PdfFileReader(stream)

            for i in range(reader.getNumPages()):
                text = reader.getPage(i).extractText()

                if regex.search(text):
                    found = True
                    break  # one match is enough; skip the remaining pages

        # Copy once, after the page loop, and only if something matched.
        if found:
            shutil.copyfile(str(src_file), str(dst_file))

或者在第一次复制后使用 break 跳出 for i 循环

from pathlib import Path
import PyPDF2
import re
import os
import shutil

# Search every PDF in `src` for a user-supplied regular expression and copy
# each PDF into `dst` as soon as its first match is found.

pattern = input("Enter string pattern to search: ")

# Compile once, outside the loops; IGNORECASE makes the search case-insensitive.
regex = re.compile(pattern, re.IGNORECASE)

# Raw string: '\h', '\D', '\A' and '\S' are not valid escape sequences, so the
# non-raw literal only worked by accident and triggers a warning on newer
# Python versions. The resulting path value is unchanged.
basepath = Path(r'\hrdinhal\Data\Desktop\Analize\Search engine')

src = basepath / 'Folder 1'
dst = basepath / 'Folder 2'

# os.scandir() yields os.DirEntry objects; the context manager closes the
# underlying directory iterator when the loop is done.
with os.scandir(src) as entries:
    for file_name in entries:

        # Skip sub-directories and anything that is not a PDF, which the
        # PDF reader would otherwise fail on.
        if not file_name.is_file() or not file_name.name.lower().endswith('.pdf'):
            continue

        # Use .name (bare file name). Joining the DirEntry itself would use
        # its full path, which replaces src/dst and makes both paths equal.
        src_file = src / file_name.name
        dst_file = dst / file_name.name

        # Open the file in binary mode ourselves: the second positional
        # argument of PdfFileReader is `strict`, not a file mode.
        with open(src_file, 'rb') as stream:
            reader = PyPDF2.PdfFileReader(stream)

            for i in range(reader.getNumPages()):
                text = reader.getPage(i).extractText()

                if regex.search(text):
                    shutil.copyfile(str(src_file), str(dst_file))
                    break  # there is no need to check rest of PDF

由于覆盖位置而导致的 SameFileError

1 个答案: