我需要从 R igraph 中导出 graphml 文件,以便手动添加列值。当我想再次导入该 graphml 文件时,它必须是正确的 UTF-8 编码并且是有效的 XML。因此,我在保存之前使用 iconv() 将数据转换为 UTF-8,如下面代码的 for 循环所示:
library(igraph)
# Read the edge list: header row present, quote handling switched off.
edges <- read.csv2(
  "https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0",
  header = TRUE,
  quote = ""
)
amount <- nrow(edges)
amount

# Vertex table template with one row per edge. The source and target tables
# share the same layout, so the template is built once and then copied.
sources <- data.frame(
  Vertexname = character(amount),
  Description = character(amount),
  Follower = numeric(amount),
  Friends = numeric(amount),
  Favourites = numeric(amount),
  Statuses = numeric(amount),
  ProfileAge = numeric(amount),
  Listed = numeric(amount),
  Timestamp = numeric(amount),
  OutDegree = numeric(amount),
  InDegree = numeric(amount),
  WOutDegree = numeric(amount),
  WInDegree = numeric(amount)
)
targets <- sources
# Clean up the text columns of `edges` so they can be written as valid XML.
#
# Only character/factor columns are touched: iconv() coerces whatever it is
# given to character, so running it over every column (as before) silently
# turned numeric edge attributes into strings.
text_columns <- which(vapply(
  edges,
  function(column) is.character(column) || is.factor(column),
  logical(1)
))
for (i in text_columns) {
  # Re-encode to UTF-8, dropping any byte sequence that cannot be converted.
  utf8_text <- iconv(as.character(edges[[i]]), to = "UTF-8", sub = "")
  # The strings are valid UTF-8 at this point, so the character class is
  # matched per character and cannot split a multi-byte sequence.
  edges[[i]] <- gsub("[[:cntrl:]]", "", utf8_text)
}
# Source vertices: name from edge column 1, timestamp from edge column 4;
# every other attribute is left missing.
sources[, 1] <- edges[, 1]
sources[, 9] <- edges[, 4]
sources[, c(2:8, 10:13)] <- NA

# Target vertices: name from edge column 2; attribute columns 2..8 are taken
# from edge columns 7..13 (same order); the rest is left missing.
targets[, 1] <- edges[, 2]
for (i in 2:8) {
  targets[, i] <- edges[, i + 5]
}
targets[, 9:13] <- NA
print("REPORT: vertices data frames filled")

# Collapse rows that are identical in every column.
sources <- unique(sources)
targets <- unique(targets)
print("REPORT: Duplicated sources and targets removed")
# Combine the source and target vertex tables into a single `nodes` table.
#
# merge(..., all=TRUE) is a full outer join on Vertexname, so a vertex that
# appears in only one of the two tables is kept. Columns present in both tables
# come back suffixed: `.x` holds the value from `sources`, `.y` the value from
# `targets`. Inside within(), each attribute is collapsed back into a single
# unsuffixed column, and the suffixed helper columns are removed by setting
# them to NULL.
#
# Rule for every attribute except Timestamp: use the `.x` value, falling back
# to `.y` when `.x` is NA.
nodes <- within(merge(sources, targets, by="Vertexname", all=TRUE), {
# NOTE(review): paste() turns a missing Description.y into the literal string
# "NA" rather than keeping NA -- confirm this is intended.
Description <- ifelse(is.na(Description.x), paste(Description.y), Description.x); Description.x = NULL; Description.y = NULL;
Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x); Follower.x = NULL; Follower.y = NULL;
Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x); Friends.x = NULL; Friends.y = NULL;
Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x); Favourites.x = NULL; Favourites.y = NULL;
Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x); Statuses.x = NULL; Statuses.y = NULL;
ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x); ProfileAge.x = NULL; ProfileAge.y = NULL;
Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x); Listed.x = NULL; Listed.y = NULL;
# Timestamp is the one attribute with the opposite preference: the `.y` value
# wins and `.x` is only the fallback.
Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y); Timestamp.x = NULL; Timestamp.y = NULL;
OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x); OutDegree.x = NULL; OutDegree.y = NULL;
InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x); InDegree.x = NULL; InDegree.y = NULL;
WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x); WOutDegree.x = NULL; WOutDegree.y = NULL;
WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x); WInDegree.x = NULL; WInDegree.y = NULL});
print("REPORT: Sources and Targets merged");
# Keep only the first row seen for each vertex name.
nodes <- nodes[!duplicated(nodes$Vertexname), ]
print("REPORT: Duplicated vertices removed")
nrow(nodes)

# Drop edges lacking either endpoint and vertices lacking a name.
edges <- edges[!is.na(edges[[1]]) & !is.na(edges[[2]]), ]
nodes <- nodes[!is.na(nodes[[1]]), ]
print("REPORT: Invalid edges and nodes removed")

# Directed graph built from the edge list, with `nodes` as the vertex table.
g <- graph.data.frame(edges, directed = TRUE, vertices = nodes)
print("REPORT: Graph created")
# Degree and strength of every vertex (the default vertex set is all of them),
# computed separately for outgoing and incoming edges.
outdegrees <- degree(g, mode = "out")
indegrees <- degree(g, mode = "in")
woutdegrees <- graph.strength(g, mode = "out")
windegrees <- graph.strength(g, mode = "in")

# Store the four measures on the graph as vertex attributes.
g <- set.vertex.attribute(g, name = "OutDegree", index = V(g), value = outdegrees)
g <- set.vertex.attribute(g, name = "InDegree", index = V(g), value = indegrees)
g <- set.vertex.attribute(g, name = "WOutDegree", index = V(g), value = woutdegrees)
g <- set.vertex.attribute(g, name = "WInDegree", index = V(g), value = windegrees)
print("REPORT: Degree calculated and added as vertex attribute")
# Filter
# Derive degree thresholds from the vertex ranked `cutoff_rank`-th by
# out-degree and by in-degree, then report how many vertices pass them.
nodes <- get.data.frame(g, "vertices")
nodes <- nodes[order(nodes$OutDegree, decreasing = TRUE), ]
nrow(nodes)

# Rank of the vertex whose degree is used as the threshold. It is clamped to
# the number of vertices: indexing past the last row would return NA and make
# every comparison below NA as well.
cutoff_rank <- min(1335, nrow(nodes))

minOutDegree <- nodes[cutoff_rank, "OutDegree"]
minOutDegree
nodes <- nodes[order(nodes$InDegree, decreasing = TRUE), ]
minInDegree <- nodes[cutoff_rank, "InDegree"]
minInDegree

# nodes2: vertices reaching at least one threshold; nodes3: reaching both.
nodes2 <- subset(nodes, nodes$OutDegree >= minOutDegree | nodes$InDegree >= minInDegree)
nrow(nodes2)
nodes3 <- subset(nodes, nodes$OutDegree >= minOutDegree & nodes$InDegree >= minInDegree)
nrow(nodes3)
# Reduce the graph in three steps and export the result as GraphML.
# Every filter below reads the OutDegree/InDegree vertex attributes already
# stored on the graph; they are not recomputed after each subsetting step.

# Empty "Group" attribute on every vertex -- presumably the column that is
# filled in by hand after export (TODO confirm).
g <- set.vertex.attribute(g, "Group", V(g), NA)

# Step 1: keep vertices that reach at least one of the degree thresholds.
g <- induced.subgraph(g, V(g)$OutDegree >= minOutDegree | V(g)$InDegree >= minInDegree)
length(E(g))
length(V(g))

# Step 2: keep vertices with both outgoing and incoming edges.
g <- induced.subgraph(g, V(g)$OutDegree > 0 & V(g)$InDegree > 0)
length(E(g))
length(V(g))

# Step 3: keep vertices whose out-degree exceeds a third of their in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > (V(g)$InDegree / 3))
length(E(g))
length(V(g))

# The format must be given explicitly: write.graph() defaults to a plain edge
# list, which drops all vertex attributes and is not a GraphML file.
write.graph(g, "SomePath", format = "graphml")
print("REPORT: Subgraph Test saved")
当我再次使用 read.graph 导入 graphml 文件时,出现错误:
Error in .Call("R_igraph_read_graph_graphml", file, as.numeric(index), :
At foreign-graphml.c:1202 :
å
, Parse error
因此,我使用XMLValidatorBuddy验证graphml文件(在下拉字段中选择UTF-8作为使用的编码,但无论选择哪种编码,都会发生错误)。这是我收到错误的地方:
无效字节 '?' 位于 2 字节序列的第 2 个字节
根据XMLValidatorBuddy,错误发生在第4278行。
这个问题的答案对我没有帮助,因为经过 R 中的转换,我得到的应该已经是 UTF-8 编码的 graphml 文件。
答案 0 :(得分:0)
这一行绝对不正确:
edges[,i] <- gsub("[[:ctrl:]]", "", edges[,i])
我认为它的目的是从边属性中删除所有不允许的控制字符,以免给 GraphML 写入器带来麻烦,但 [[:ctrl:]] 应该是 [[:cntrl:]]。(实际上,我的 R 版本在看到 [[:ctrl:]] 时会报错,但也许你的版本不会。)
另外,在转换为 UTF-8 之后,我会避免再改动字符串中的单个字符。如果要从字符串中删除控制字符,请在转换为 UTF-8 之前进行。根据 UTF-8 编码的工作原理,字符代码小于 128 的 Unicode 字符(包括您担心的所有控制字符)在编码后保持不变,并且 UTF-8 编码不会额外引入任何字符代码小于 128 的字节。