我需要一个bash脚本,它从pdf中提取所有光栅和矢量图像,并将它们转换为jpg格式。
我在网上查了很多帖子,大部分思路来自以下几篇:
How can I extract images from a PDF file?
Count the number of the raster images in the pdf
How to extract a vector figure from pdf?
这个脚本可以正常工作。我把它分享出来,是因为我在网上没有找到类似的解决方案。
但到目前为止,还有两个小问题我没能解决:
1. pdf2svg 会把文本也当作矢量图像,因此会额外生成带有文本的图像。有没有办法把文字与真正的矢量图像区分开来?
2. pdf2svg 会生成一个包含所有矢量图像的 SVG 图像(页面包含文本时同样如此)。是否可以把它们提取为各自独立的图像?
bash 脚本:
#!/bin/bash
# Command-line arguments: the working directory comes first, the PDF to
# process second.
TMP_DIR="${1}"
SOURCE_PDF="${2}"
# Target width/height passed to rsvg-convert when a page is rendered from SVG.
MAX_HEIGHT=1080
MAX_WIDTH=1920
printf 'source: %s\n' "${SOURCE_PDF}"
# Splits a PDF into single-page PDFs in the current directory using pdftk,
# then removes the source PDF and the *.txt files left in the directory.
# Arguments:
#   $1 - path of the PDF to split
# Returns:
#   0 on success; 1 if pdftk fails (the source PDF is then left untouched).
function burst {
  local source=$1

  # explodes the pages to pdf files (it is necessary for the vector images
  # export). Run pdftk directly: wrapping it in backticks would execute
  # whatever it prints as a second command.
  if ! /usr/bin/pdftk "$source" burst; then
    echo "pdftk burst failed for $source" >&2
    return 1
  fi

  # removes the source pdf (we do not need it any more)
  rm -- "$source"
  # and the txt files which were generated by the pdftk
  rm -f -- ./*.txt
}
# Finds every *.pdf file below the current directory and calls
# check_for_images on each one together with a running 1-based counter.
# Arguments:
#   $1 - working directory; accepted for compatibility with existing callers
#        but not read, the search always starts at the current directory.
# Outputs:
#   one "processing page <path>" line per file on stdout
function process_pages {
  local pnum=1
  local f
  local -a pages=()

  # Collect the list before processing, NUL-delimited so that paths with
  # whitespace stay intact. Sorting in byte order makes the counter follow
  # the file names instead of the arbitrary order find returns them in.
  while IFS= read -r -d '' f; do
    pages+=("$f")
  done < <(find . -type f -name "*.pdf" -print0 | LC_ALL=C sort -z)

  for f in "${pages[@]}"; do
    echo "processing page $f"
    check_for_images "$f" "$pnum"
    pnum=$((pnum + 1))
  done
}
# Decides how a single page PDF is handled: if the `pdfimages -list` output
# for the page mentions jpeg, png or gif the raster extractor is called,
# otherwise the vector extractor. The page PDF is deleted afterwards in
# either case.
# Arguments:
#   $1 - path of the single-page PDF
#   $2 - page counter, passed through to the extractor
function check_for_images {
  local pdf_page=$1
  local pnum=$2
  # Kept under their original names: functions called from here can still
  # read them, but they no longer leak into the rest of the script.
  local list_raster_images
  local is_raster_images

  # checks whether the page contains a raster image
  # NOTE(review): this matches on the text of the listing only; confirm that
  # "png" and "gif" can actually appear there - images that are not
  # JPEG-encoded may go undetected.
  list_raster_images=$(/usr/bin/pdfimages -list "$pdf_page" | grep -E "(jpeg|png|gif)")
  is_raster_images=${#list_raster_images}

  if (( is_raster_images > 0 )); then
    # it contains raster image(s), extract them
    extract_raster_images "$pdf_page" "$pnum"
  else
    # it does not contain raster image(s), try to extract vector images
    extract_vector_images "$pdf_page" "$pnum"
  fi

  rm -- "$pdf_page"
}
# Extracts the embedded images of a page with `pdfimages -all` into the
# current directory and renames the resulting `-NNN.jpg` files so that they
# carry the page's base name (e.g. `-000.jpg` -> `<page>-000.jpg`).
# Arguments:
#   $1 - path of the single-page PDF
#   $2 - page counter (accepted but not used)
# Returns:
#   0 on success; 1 if pdfimages fails (nothing is renamed in that case).
function extract_raster_images {
  local pdf_page=$1
  local pdf_file="${pdf_page%.*}"
  local f path img_file img_ext img_num
  local -a extracted=()

  echo "extract all raster image(s) from this page";
  # Run pdfimages directly: inside backticks its output would be executed
  # as a command.
  if ! /usr/bin/pdfimages -all "$pdf_page" ./; then
    echo "pdfimages failed for $pdf_page" >&2
    return 1
  fi

  # we need to use a very same file name convention so this part renames them
  # who knows it might be useful later
  # NOTE(review): only *.jpg output is renamed; files written with another
  # extension keep their `-NNN.<ext>` name - confirm whether that is intended.
  while IFS= read -r -d '' f; do
    extracted+=("$f")
  done < <(find . -regextype sed -regex ".*/-[0-9]\{3\}\.jpg" -print0)

  for f in "${extracted[@]}"; do
    path=${f%/*}
    img_file=${f##*/}
    img_ext=${img_file##*.}
    img_num=${img_file%.*}
    mv -- "$f" "$path/$pdf_file$img_num.$img_ext"
  done
}
# Renders a page to SVG with pdf2svg and, unless the SVG embeds raster data
# (a "data:image" URI), converts it to `<page>-000.jpg` via rsvg-convert and
# ImageMagick's convert. The intermediate SVG/PNG files are removed again.
# Globals:
#   MAX_WIDTH, MAX_HEIGHT (read) - size limits passed to rsvg-convert
# Arguments:
#   $1 - path of the single-page PDF
#   $2 - page counter (accepted but not used)
# Returns:
#   0 on success or when the page is skipped; non-zero if a tool fails.
function extract_vector_images {
  local pdf_page=$1
  local pdf_file="${pdf_page%.*}"
  local svg_file="$pdf_page.svg"
  local png_file="$pdf_page.png"
  local status=0

  echo "extract vector image from the page as SVG"
  if ! /usr/bin/pdf2svg "$pdf_page" "$svg_file"; then
    echo "pdf2svg failed for $pdf_page" >&2
    rm -f -- "$svg_file"
    return 1
  fi

  # just to be sure it is not a raster image. The check is made on the SVG
  # produced above; previously a differently named variable was tested, so
  # this guard never had any effect.
  if ! grep -q -i -- "data:image" "$svg_file"; then
    # convert SVG to PNG (it doesn't know JPG format) with fixed sizes, but keep the aspect ratio
    if /usr/bin/rsvg-convert -a -w "$MAX_WIDTH" -h "$MAX_HEIGHT" -f png -o "$png_file" "$svg_file"; then
      # convert PNG to JPG
      convert "$png_file" -background white -flatten -alpha off "$pdf_file-000.jpg" || status=$?
    else
      status=$?
    fi
  fi

  # Remove only the intermediates created here, not every SVG/PNG that
  # happens to be in the directory.
  rm -f -- "$svg_file" "$png_file"
  return "$status"
}
# Entry point. The steps below delete *.txt and *.pdf files in the directory
# they run in, so refuse to continue unless both arguments were given, the
# working directory could be entered and the source PDF is really there.
if [[ -z "$TMP_DIR" || -z "$SOURCE_PDF" ]]; then
  echo "usage: $0 TMP_DIR SOURCE_PDF" >&2
  exit 2
fi
cd "$TMP_DIR" || exit 1
if [[ ! -f "$SOURCE_PDF" ]]; then
  echo "source pdf not found: $SOURCE_PDF" >&2
  exit 1
fi
burst "$SOURCE_PDF"
process_pages "$TMP_DIR"
从 PHP 中调用该脚本:
// Working directory for this upload, named after the uploaded temp file.
// NOTE(review): '/path-of-tmp-dir' is joined to the file name without a
// separator - confirm the real path ends with a directory separator.
$tmpName = basename($file['tmp_name']);
$tmpDir = '/path-of-tmp-dir' . $tmpName . '_extraction';
// The script changes into this directory and deletes files there, so do not
// start it when the directory could not be created.
if (!is_dir($tmpDir) && !mkdir($tmpDir)) {
    throw new \RuntimeException('Cannot create extraction directory: ' . $tmpDir);
}
// Quote both arguments so that paths containing spaces or shell
// metacharacters reach the script as single, literal words.
$command = 'extract_pdf_images.sh ' . escapeshellarg($tmpDir) . ' ' . escapeshellarg($file['tmp_name']);
exec($command);
要求
# pdfimages is provided by poppler-utils, rsvg-convert by librsvg2-bin and
# convert by imagemagick.
apt-get install pdftk poppler-utils pdf2svg librsvg2-bin imagemagick