我想构建一个抓取(部分)网站的 Chrome 扩展程序。我想从弹出窗口调用后台页面中的一个异步方法,该方法会发出一些阻塞(同步)请求。调用之后,我想用 setInterval 定期向后台请求状态信息,并把状态显示在弹出窗口中……但是一旦开始该过程,弹出窗口就会被锁定,直到后台进程结束。我尝试过许多办法,比如在后台收到消息后返回 true、在后台以异步方式启动该进程,但都没有解决问题…… 这是我的代码:
清单
{
"manifest_version": 2,
"name": "...",
"description": "...",
"version": "1.0",
"browser_action": {
"default_icon": "logo_76.png",
"default_popup": "main.htm"
},
"background": {
"scripts": [
"lib/regex_utils.js",
"lib/uri.js",
"lib/uri_utils.js",
"lib/bloomfilter.js",
"background.js"
],
"persistent": true
},
"content_scripts":[{
"matches": ["*://*/*"],
"js": ["content.js"]
}],
"permissions": [
"activeTab",
"http://*/*",
"https://*/*"
]
}
弹出
// Popup-wide state: `t` is the id of the status-polling timer and `port` is
// the messaging port to the background page (null until one is opened).
var t = null,
  port = null;
/**
 * Reports a status message received from the background page.
 * At the moment the message is only written to the console.
 *
 * @param {*} response - status payload handed over by the port listener.
 */
function showStatus(response) {
  // UI update left disabled by the author:
  // document.getElementById('start').innerHTML = 'parsed ' + response['parsed']
  console.log(response);
}
/**
 * Polling tick: logs 'check' and, when a port is open, asks the background
 * page for the current status by posting a `status` command.
 */
function checkStatus() {
  console.log('check')
  if (!port) {
    return;
  }
  port.postMessage({ command: 'status' });
}
// Poll the background page every two seconds. checkStatus() does nothing
// until a port exists; any previously scheduled timer is cleared first.
clearInterval(t);
t = setInterval(checkStatus, 2000);

// Wire the "start" button: on click, open a port to the background page and
// ask it to crawl the URL of the active tab.
document.addEventListener('DOMContentLoaded', function () {
  var startButton = document.getElementById('start');
  if (!startButton) {
    return;
  }

  startButton.addEventListener('click', function () {
    // chrome.tabs.getSelected is deprecated; query the active tab instead.
    chrome.tabs.query({ active: true, currentWindow: true }, function (tabs) {
      var tab = tabs && tabs[0];
      if (!tab) {
        return;
      }

      // Close the port left over from an earlier click so that only one
      // connection is polled and only one listener reports status.
      if (port) {
        port.disconnect();
      }

      var connection = chrome.runtime.connect({ name: 'communication' });
      port = connection;

      // Stop polling a port that the other side has closed.
      connection.onDisconnect.addListener(function () {
        if (port === connection) {
          port = null;
        }
      });

      // Register the listener before sending the first command.
      connection.onMessage.addListener(function (msg) {
        var status = msg;
        // Status may arrive as a JSON string; decode it before inspecting it.
        if (typeof msg === 'string') {
          try {
            status = JSON.parse(msg);
          } catch (err) {
            // Plain-text notice (not JSON) - nothing to display.
            return;
          }
        }
        if (status && status['parsed'] != undefined) {
          showStatus(status);
        }
      });

      connection.postMessage({ command: 'start_crawl', url: tab['url'] });
    });
  }, false);
}, false);
Background
/**
 * Downloads `url` and returns the links found in its markup that belong to
 * ROOT_DOMAIN.
 *
 * NOTE(review): the request is synchronous (third argument of open() is
 * `false`), so this call blocks the calling thread until the response
 * arrives.
 *
 * @param {string} url - address of the page to download.
 * @returns {Array} absolute links on ROOT_DOMAIN; an empty array when the
 *   request throws, the status is not 200, or the response body is empty.
 */
function getLinksFromUrl(url) {
  var xhr = new XMLHttpRequest();
  try {
    xhr.open("GET", url, false);
    xhr.send();
  } catch (err) {
    // Best effort: a failed request simply yields no links.
    console.log('request exception', err);
    return [];
  }

  // A synchronous request is complete once send() returns, so the result can
  // be read directly instead of through an onreadystatechange handler.
  if (xhr.readyState != 4 || xhr.status != 200) {
    return [];
  }
  var html = xhr.responseText;
  if (!html) {
    return [];
  }

  // Declared locally so they no longer leak as implicit globals.
  var linkRegex = new RegExp('a[^>]+?href=["\']{0,1}([^"\'\\s>]+)', 'igm');
  var baseRegex = new RegExp('base[^>]+?href=["\']{0,1}([^"\'\\s>]+)', 'igm');

  // execAll is not a built-in RegExp method; presumably provided by
  // lib/regex_utils.js - verify.
  var rawLinks = linkRegex.execAll(html).map(function (match) {
    return match[1];
  });

  // href of a <base> tag when present, otherwise null.
  var baseMatch = baseRegex.exec(html);
  var base = (baseMatch && baseMatch[1] != undefined) ? baseMatch[1] : null;

  // Keep only links whose domain equals the crawl's root domain.
  return relative2absolute(rawLinks, url, base).filter(function (link) {
    return new URI(link).domain() == ROOT_DOMAIN;
  });
}
// Crawl state shared by crawl(), getStatus() and getLinksFromUrl().
var TO_CRAWL = [],                    // URLs waiting to be fetched
  ALL_UNIQUE_URLS = [],               // URLs recorded by crawl()
  PASRSED_URLS = new BloomFilter(),   // membership filter of fetched URLs
  PAGE_LIMIT = 150,                   // crawl() stops at this many recorded URLs
  PARSED = 0,                         // number of pages fetched so far
  ROOT_DOMAIN = '';                   // domain the crawl is restricted to
/**
 * Defers `fn` to a later timer tick and calls `callback` right after it.
 * Both run back to back inside the same timer callback.
 *
 * @param {Function} fn - work to run on the deferred tick.
 * @param {Function} callback - invoked immediately after `fn` returns.
 */
function async(fn, callback) {
  var runThenNotify = function () {
    fn();
    callback();
  };
  setTimeout(runThenNotify, 0);
}
/**
 * Snapshot of the crawl counters.
 *
 * @returns {{parsed: number, unique: number, queue: number}} pages fetched so
 *   far, number of recorded URLs, and number of URLs still queued.
 */
function getStatus() {
  return {
    parsed: PARSED,
    unique: ALL_UNIQUE_URLS.length,
    queue: TO_CRAWL.length
  };
}
/**
 * Crawls pages reachable from `initialUrl`, restricted to its domain.
 *
 * Resets the shared crawl state (ROOT_DOMAIN, TO_CRAWL, ALL_UNIQUE_URLS,
 * PASRSED_URLS, PARSED), seeds the queue with the links of the initial page
 * and then keeps fetching queued URLs until PAGE_LIMIT URLs are recorded or
 * the queue is empty.
 *
 * NOTE(review): the whole loop runs synchronously, so nothing else on the
 * same thread can run until it finishes.
 *
 * @param {string} initialUrl - page the crawl starts from.
 */
function crawl(initialUrl) {
  var uri = new URI(initialUrl);
  ROOT_DOMAIN = uri.domain();
  ALL_UNIQUE_URLS = [];
  PASRSED_URLS = new BloomFilter();
  PARSED = 0;
  TO_CRAWL = getLinksFromUrl(initialUrl);

  while (ALL_UNIQUE_URLS.length < PAGE_LIMIT && TO_CRAWL.length > 0) {
    // Local variable: the original assigned an implicit global here.
    var url = TO_CRAWL.pop();

    // The same URL can be queued several times before its first visit;
    // skip it once visited so it is neither fetched nor recorded twice.
    if (PASRSED_URLS.check(url)) {
      continue;
    }

    var links = getLinksFromUrl(url);
    // TODO Add extra check for 404 and other error codes

    // Mark current URL as crawled
    PARSED++;
    PASRSED_URLS.add(url);
    ALL_UNIQUE_URLS.push(url);

    // Add new URLs in queue
    // TODO check domain here
    var unseen = links.filter(function (link) {
      return !PASRSED_URLS.check(link);
    });
    TO_CRAWL.push.apply(TO_CRAWL, unseen);
  }
}
// Serve popup connections: `start_crawl` starts a crawl and periodic status
// pushes, `status` answers with the current counters.
// chrome.runtime.onConnect replaces the deprecated chrome.extension.onConnect.
chrome.runtime.onConnect.addListener(function (port) {
  var statusTimer = null;
  var connected = true;

  // Posts the counters as a plain object so the receiver can read
  // msg['parsed'] directly; posting on a disconnected port would throw.
  function sendStatus() {
    if (connected) {
      port.postMessage(getStatus());
    }
  }

  function stopStatusTimer() {
    if (statusTimer !== null) {
      clearInterval(statusTimer);
      statusTimer = null;
    }
  }

  // The popup closing disconnects the port; stop pushing status to it.
  port.onDisconnect.addListener(function () {
    connected = false;
    stopStatusTimer();
  });

  port.onMessage.addListener(function (data) {
    if (!data) {
      return;
    }

    if (data['command'] == 'start_crawl') {
      async(
        function () {
          crawl(data['url']);
        },
        function () {
          console.log(ALL_UNIQUE_URLS);
          // Crawl finished: send the final counters and stop the timer.
          sendStatus();
          stopStatusTimer();
        }
      );
      // SECOND METHOD TO SEND DATA AT POPUP
      // Restart the timer so repeated start commands do not stack intervals.
      stopStatusTimer();
      statusTimer = setInterval(sendStatus, 1000);
      console.log('started')
      port.postMessage('crawling started');
      return;
    }

    // FIRST METHOD TO SEND DATA AT POPUP
    if (data['command'] == 'status') {
      sendStatus();
    }
  });
});