我正在努力创建一个抓取工具,使.csv文件列出域中每个页面的标题和网址。它似乎很好地抓取了网站,但是当它试图解析标题时我得到一个错误:
line 38: syntax error near unexpected token `done'
有人可以帮我消除这个错误的原因吗?
这是我正在使用的脚本
#!/bin/bash
# Crawls a domain
# Retreives all visible URLs and their page titles
# Saves to CSV
# USAGE:
# save this script as, say, “spiderbot.sh”.
# Then “chmod +x spiderbot.sh”.
# Then run the script passing in the name of this site you want to crawl, for example ` ./spiderbot.sh http://www.someSite.com `
# Text color variables
txtund=$(tput sgr 0 1)
# Underline
txtbld=$(tput bold)
# Bold
bldred=${txtbld}$(tput setaf 1) # red
bldblu=${txtbld}$(tput setaf 4) # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7) # white
txtrst=$(tput sgr0) # Reset
info=${bldwht}*${txtrst} # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst"
# wget in Spider mode, outputs to wglog file
# -R switch to ignore specific file types (images, javascript etc.)
wget --reject-regex "(.*)\?(.*)" --no-check-certificate --spider -r -l inf -w .25 -nc -nd $1 -R bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,txt,xml,xls,zip 2>&1 | tee wglog v
printf " %s========================================== " "$bldgreen"
printf "%s=== Crawl Finished... ===%s " "$bldgreen" "$txtrst"
printf "%s=== Begin retreiving page titles... ===%s " "$bldgreen" "$txtrst"
printf "%s========================================== " "$bldgreen"
printf "%s** Run tail -f $1.csv for progress%s " "$bldred" "$txtrst"
# from wglog, grab URLs
# curl each URL and grep title
cat wglog | grep '^--' | awk '{print $3}' | sort | uniq | while read url;
do { printf "%s* Retreiving title for: %s$url%s " "$bldgreen" "$txtrst$txtbld" "$txtrst" printf ""${url}","`curl -# ${url} | sed -n -e 's!.*<title>(.*)</title>.*!1!p'`" " >> $1.csv printf " "};
done
# clean up log file
rm wglog
exit
答案 0 :(得分:1)
您需要在do { ... }
即do { ...; }