我目前正在开展一个项目,我需要从dblp上发布的文章中获取作者的隶属机构(affiliation)。 因此,我设置了一个Zotero翻译服务器(translation-server),您可以在其GitHub仓库中获取该服务器,并按照相应的说明进行安装。
然后我在我的Java程序中建立一个连接,如下所示:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import org.json.*;
public class ZoteroHandler
{
//Function runing the scan
public static void Scan(Article article) throws Exception
{
//Setting up an URL HttpURLConnection given DOI
URL urlDoi = new URL (article.GetElectronicEdition());
HttpURLConnection connDoi = (HttpURLConnection) urlDoi.openConnection();
// Make the logic below easier to detect redirections
connDoi.setInstanceFollowRedirects(false);
String doi = "{\"url\"Smiley unsure"" + connDoi.getHeaderField("Location") + "\",\"sessionid\"Smiley unsure"abc123\"}";
//Setting up an URL to translation-server
URL url = new URL("http://127.0.0.1:1969/web");
URLConnection conn = url.openConnection();
conn.setDoOutput(true);
conn.setRequestProperty("Content-Type", "application/json");
OutputStreamWriter writer = new OutputStreamWriter(conn.getOutputStream());
writer.write(doi);
writer.flush();
String line;
BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
while ((line = reader.readLine()) != null )
{
//Used to see of we get something from stream
System.out.println(line);
//Incoming is JSONArray, so create new array, fill it then parse it
JSONArray jsonArr = new JSONArray(line);
JSONObject obj = jsonArr.getJSONObject(0);
//Getting abstracts
String abstracts = obj.getString("abstractNote");
System.out.println(abstracts);
//Setting information in db
article.SetAbstracts(abstracts);
DatabaseHandler.GetInstance().UpdateArticle(article);
}
writer.close();
reader.close();
//Need to disconnect?
//((HttpURLConnection) conn).disconnect();
//connDoi.disconnect();
}
到目前为止一切顺利。我获取了想要的信息,将其存入摘要字符串并写入数据库。 但现在我还需要获得作者的隶属机构(affiliation),所以我需要以某种方式修改我使用的翻译脚本。
这是该脚本:
{
"translatorID": "5af42734-7cd5-4c69-97fc-bc406999bdba",
"label": "Atypon Journals",
"creator": "Sebastian Karcher",
"target": "^https?://[^?#]+(?:/doi/((?:abs|abstract|full|figure|ref|citedby|book)/)?10\\.|/action/doSearch\\?)|^https?://[^/]+/toc/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 270,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-10-15 22:24:05"
}
/*
Atypon Journals Translator
Copyright (C) 2011-2014 Sebastian Karcher
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
function detectWeb(doc, url)
{
if (url.search(/^https?:\/\/[^\/]+\/toc\/|\/action\/doSearch\?/) != -1)
{
return getSearchResults(doc, true) ? "multiple" : false;
}
var citLinks = ZU.xpath(doc, '//a[contains(@href, "/action/showCitFormats")]');
if (citLinks.length > 0) {
if (url.indexOf('/doi/book/') != -1) {
return 'book';
}
else if (url.search(/\.ch\d+$/)!=-1){
return 'bookSection';
}
return "journalArticle";
}
}
function getSearchResults(doc, checkOnly, extras) {
var articles = {};
var container = doc.getElementsByName('frmSearchResults')[0]
|| doc.getElementsByName('frmAbs')[0];
if (!container) {
Z.debug('Atypon: multiples container not found.');
return false;
}
var rows = container.getElementsByClassName('articleEntry'),
found = false,
doiLink = 'a[contains(@href, "/doi/abs/") or contains(@href, "/doi/abstract/") or '
+ 'contains(@href, "/doi/full/") or contains(@href, "/doi/book/")]';
for (var i = 0; i<rows.length; i++) {
var title = rows[i].getElementsByClassName('art_title')[0];
if (!title) continue;
title = ZU.trimInternal(title.textContent);
var urlRow = rows[i];
var url = ZU.xpathText(urlRow, '(.//' + doiLink + ')[1]/@href');
if (!url) {
// e.g. http://pubs.rsna.org/toc/radiographics/toc/33/7 shows links in adjacent div
urlRow = rows[i].nextElementSibling;
if (!urlRow || urlRow.classList.contains('articleEntry')) continue;
url = ZU.xpathText(urlRow, '(.//' + doiLink + ')[1]/@href');
}
if (!url) continue;
if (checkOnly) return true;
found = true;
if (extras) {
extras[url] = { pdf: buildPdfUrl(url, urlRow) };
}
articles[url] = title;
}
if (!found){
Z.debug("Trying an alternate multiple format");
var rows = container.getElementsByClassName("item-details");
for (var i = 0; i<rows.length; i++) {
var title = ZU.xpathText(rows[i], './h3');
if (!title) continue;
title = ZU.trimInternal(title);
var url = ZU.xpathText(rows[i], '(.//ul[contains(@class, "icon-list")]/li/'
+ doiLink + ')[1]/@href');
if (!url) continue;
if (checkOnly) return true;
found = true;
if (extras) {
extras[url] = { pdf: buildPdfUrl(url, rows[i]) };
}
articles[url] = title;
}
}
return found ? articles : false;
}
// Keep this in line with target regexp
var replURLRegExp = /\/doi\/((?:abs|abstract|full|figure|ref|citedby|book)\/)?/;
function buildPdfUrl(url, root) {
if (!replURLRegExp.test(url)) return false; // The whole thing is probably going to fail anyway
var pdfPaths = ['/doi/pdf/', '/doi/pdfplus/'];
for (var i=0; i<pdfPaths.length; i++) {
if (ZU.xpath(root, './/a[contains(@href, "' + pdfPaths[i] + '")]').length) {
return url.replace(replURLRegExp, pdfPaths[i]);
}
}
Z.debug('PDF link not found.')
if (root.nodeType != 9 /*DOCUMENT_NODE*/) {
Z.debug('Available links:');
var links = root.getElementsByTagName('a');
if (!links.length) Z.debug('No links');
for (var i=0; i<links.length; i++) {
Z.debug(links[i].href);
}
}
return false;
}
function doWeb(doc, url) {
if (detectWeb(doc, url) == "multiple") {
var extras = {};
Zotero.selectItems(getSearchResults(doc, false, extras), function (items) {
if (!items) {
return true;
}
var articles = [];
for (var itemurl in items) {
articles.push({
url: itemurl.replace(/\?prev.+/, ""),
extras: extras[itemurl]
});
}
fetchArticles(articles);
});
} else {
scrape(doc, url, {pdf: buildPdfUrl(url, doc)});
}
}
function fixCase(str, titleCase) {
if (str.toUpperCase() != str) return str;
if (titleCase) {
return ZU.capitalizeTitle(str, true);
}
return str.charAt(0) + str.substr(1).toLowerCase();
}
function fetchArticles(articles) {
if (!articles.length) return;
var article = articles.shift();
ZU.processDocuments(article.url, function(doc, url) {
scrape(doc, url, article.extras);
},
function() {
if (articles.length) fetchArticles(articles);
});
}
function scrape(doc, url, extras) {
url = url.replace(/[?#].*/, "");
var doi = url.match(/10\.[^?#]+/)[0];
var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");
var abstract = doc.getElementsByClassName('abstractSection')[0];
//var authorAffiliation = doc.getElementsByClassName('listGroup')[0];
var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
Z.debug("Citation URL: " + citationurl);
ZU.processDocuments(citationurl, function(citationDoc){
var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
Z.debug("Filename: " + filename);
var get = '/action/downloadCitation';
var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';
ZU.doPost(get, post, function (text)
{
//Z.debug(text);
var translator = Zotero.loadTranslator("import");
// Calling the RIS translator
translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
translator.setString(text);
translator.setHandler("itemDone", function (obj, item)
{
// Sometimes we get titles and authros in all caps
item.title = fixCase(item.title);
for (var i=0; i<item.creators.length; i++)
{
item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
if (item.creators[i].firstName) {
item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
}
}
item.url = url;
//for Emerald, get rid of the "null" that they add at the end of every title:
if (url.indexOf("www.emeraldinsight.com")!=-1){
item.title = item.title.replace(/null$/, "")
}
item.notes = [];
for (var i in tags)
{
item.tags.push(tags[i].textContent)
}
if (abstract)
{
// Drop "Abstract" prefix
// This is not excellent, since some abstracts could
// conceivably begin with the word "abstract"
item.abstractNote = abstract.textContent
.replace(/^\s*abstract\s*/i, '');
}
item.attachments = [];
if (extras.pdf) {
item.attachments.push({
url: extras.pdf,
title: "Full Text PDF",
mimeType: "application/pdf"
});
}
item.attachments.push({
document: doc,
title: "Snapshot",
mimeType: "text/html"
});
item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
.replace(/[\/:].*/, '') + " (Atypon)";
item.complete();
});
translator.translate();
});
})
}
那么是否有人可以告诉我如何更新这个脚本,以便我可以获得作者的隶属机构? 我知道该脚本应该在HTML类 "listGroup" 中查找作者的隶属信息。
答案 0(得分:0)
我通过这样做解决了这个问题:
function scrape(doc, url, extras) {
url = url.replace(/[?#].*/, "");
var doi = url.match(/10\.[^?#]+/)[0];
var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");
//TESTING
var affiliations = [];
var affiliation = doc.getElementsByClassName('listGroup');
var abstract = doc.getElementsByClassName('abstractSection')[0];
var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
Z.debug("Citation URL: " + citationurl);
ZU.processDocuments(citationurl, function(citationDoc){
var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
Z.debug("Filename: " + filename);
var get = '/action/downloadCitation';
var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';
ZU.doPost(get, post, function (text) {
//Z.debug(text);
var translator = Zotero.loadTranslator("import");
// Calling the RIS translator
translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
translator.setString(text);
translator.setHandler("itemDone", function (obj, item) {
// Sometimes we get titles and authros in all caps
item.title = fixCase(item.title);
for (var i=0; i<item.creators.length; i++) {
item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
if (item.creators[i].firstName) {
item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
}
}
item.url = url;
//for Emerald, get rid of the "null" that they add at the end of every title:
if (url.indexOf("www.emeraldinsight.com")!=-1){
item.title = item.title.replace(/null$/, "")
}
item.notes = [];
for (var i in tags){
item.tags.push(tags[i].textContent)
}
if (abstract) {
// Drop "Abstract" prefix
// This is not excellent, since some abstracts could
// conceivably begin with the word "abstract"
item.abstractNote = abstract.textContent
.replace(/^\s*abstract\s*/i, '');
}
item.attachments = [];
if (extras.pdf) {
item.attachments.push({
url: extras.pdf,
title: "Full Text PDF",
mimeType: "application/pdf"
});
}
item.attachments.push({
document: doc,
title: "Snapshot",
mimeType: "text/html"
});
item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
.replace(/[\/:].*/, '') + " (Atypon)";
//Affiliations
for (i=0; i<affiliations.length; i++)
{
affiliation.push(affiliations[i].textContent)
}
item.extra = affiliation.join("; ");
item.complete();
});
translator.translate();
});
})
我创建了一个名为affiliations的数组和一个名为affiliation的变量。然后我用获取到的字符串填充该数组,并将其内容存储在Zotero名为extra的字段中——这是因为Zotero没有专门用于作者隶属机构(affiliation)的字段。所以这只是一个变通办法,以便我能在自己的程序中使用这些数据。