Question

我正在尝试用一个 CSV 文件构建一张图，该文件包含节点之间的边，以及每个节点的职业和年龄信息。我为每个节点分配社区，然后执行链接预测。

import networkx as nx
import csv
# One list per (profession, age band) pair. The suffix 1-5 is the age band used
# by the classification pass later in this script: 13-17, 18-29, 30-49, 50-64, 65+.
# NOTE(review): writers1-5 and nodes are never filled or read in the visible code.
engineers1 = []
engineers2 = []
engineers3 = []
engineers4 = []
engineers5 = []
actors1= []
actors2= []
actors3= []
actors4= []
actors5= []
writers1 = []
writers2= []
writers3= []
writers4 = []
writers5 = []
doctors1= []
doctors2= []
doctors3= []
doctors4= []
doctors5= []
drivers1=[]
drivers2=[]
drivers3=[]
drivers4=[]
drivers5=[]
teachers1=[]
teachers2=[]
teachers3=[]
teachers4=[]
teachers5=[]
nodes=[]
g=nx.Graph()

# Pre-create nodes labelled with the integers 0..4037.
for i in range(0,4038):
    g.add_node(i)

# Add one edge per CSV row, using the 'first' and 'second' columns as endpoints.
# NOTE(review): csv.DictReader yields strings, so these endpoints are str labels,
# which are different nodes from the integer labels created by add_node(i).
with open("asd1.csv",'r') as csv_file:
    csv_reader=csv.DictReader(csv_file)

    for line in csv_reader:
        g.add_edge(line['first'],line['second'])

# Redundant: leaving the with-block has already closed the file.
csv_file.close()

# Second pass over the same CSV: append each row's 'name' to the list for its
# profession and age band.
# NOTE(review): the indentation below is inconsistent as pasted (wrapped
# conditions continue at column 0 and the if-statements sit at several
# different depths), so this block does not run as-is.
# NOTE(review): 'age' comes from csv.DictReader as a string, so every
# comparison against '13', '17', ... is lexicographic, not numeric.
with open("asd1.csv",'r') as csv_file:
    csv_reader=csv.DictReader(csv_file)
    for line in csv_reader:
         if (line['profession'] == 'actor' and line['age'] >= '13' and 
line['age'] <= '17'):
            actors1.append(line['name'])
        if (line['profession'] == 'actor' and line['age'] >= '18' and 
line['age'] <= '29'):
          actors2.append(line['name'])
        if (line['profession'] == 'actor' and line['age'] >= '30' and 
line['age'] <= '49'):
        actors3.append(line['name'])
    if (line['profession'] == 'actor' and line['age'] >= '50' and line['age'] <= '64'):
        actors4.append(line['name'])
    if (line['profession'] == 'actor' and line['age'] >= '65'):
        actors5.append(line['name'])

    if (line['profession'] == 'eng' and line['age'] >= '13' and line['age'] <= '17'):
        engineers1.append(line['name'])
    if (line['profession'] == 'eng' and line['age'] >= '18' and line['age'] <= '29'):
        engineers2.append(line['name'])
    if (line['profession'] == 'eng' and line['age'] >= '30' and line['age'] <= '49'):
        engineers3.append(line['name'])
    if (line['profession'] == 'eng' and line['age'] >= '50' and line['age'] <= '64'):
        engineers4.append(line['name'])
    if (line['profession'] == 'eng' and line['age'] >= '65'):
        engineers5.append(line['name'])

    if (line['profession'] == 'teacher' and line['age'] >= '13' and line['age'] <= '17'):
        teachers1.append(line['name'])
    if (line['profession'] == 'teacher' and line['age'] >= '18' and line['age'] <= '29'):
        teachers2.append(line['name'])
    if (line['profession'] == 'teacher' and line['age'] >= '30' and line['age'] <= '49'):
        teachers3.append(line['name'])
    if (line['profession'] == 'teacher' and line['age'] >= '50' and line['age'] <= '64'):
        teachers4.append(line['name'])
    if (line['profession'] == 'teacher' and line['age'] >= '65'):
        teachers5.append(line['name'])

    if (line['profession'] == 'driver' and line['age'] >= '13' and line['age'] <= '17'):
        drivers1.append(line['name'])
    if (line['profession'] == 'driver' and line['age'] >= '18' and line['age'] <= '29'):
        drivers2.append(line['name'])
    if (line['profession'] == 'driver' and line['age'] >= '30' and line['age'] <= '49'):
        drivers3.append(line['name'])
    # NOTE(review): this 'driver' 50-64 branch appends to doctors4, not drivers4.
    if (line['profession'] == 'driver' and line['age'] >= '50' and line['age'] <= '64'):
        doctors4.append(line['name'])
    if (line['profession'] == 'driver' and line['age'] >= '65'):
        drivers5.append(line['name'])

    if (line['profession'] == 'doctor' and line['age'] >= '13' and line['age'] <= '17'):
        doctors1.append(line['name'])
    if (line['profession'] == 'doctor' and line['age'] >= '18' and line['age'] <= '29'):
        doctors2.append(line['name'])
    if (line['profession'] == 'doctor' and line['age'] >= '30' and line['age'] <= '49'):
        doctors3.append(line['name'])
    # NOTE(review): this 'doctor' 50-64 branch appends to drivers4, not doctors4.
    if (line['profession'] == 'doctor' and line['age'] >= '50' and line['age'] <= '64'):
        drivers4.append(line['name'])
    if (line['profession'] == 'doctor' and line['age'] >= '65'):
        doctors5.append(line['name'])

# Redundant: leaving the with-block has already closed the file.
csv_file.close()

# Dump every bucket, one group of five lines per profession, each group
# followed by a blank separator. The writers1-5 lists are not printed.
print("actors having age between 13 and 17: ",actors1) 
print("actors having age between 18 and 29: ",actors2)
print("actors having age between 30 and 49: ",actors3) 
print("actors having age between 50 and 64: ",actors4)
print("actors having age 65 and above: ",actors5)
print('\n')

print("engineers having age between 13 and 17: ",engineers1)
print("engineers having age between 18 and 29: ",engineers2)
print("engineers having age between 30 and 49: ",engineers3)
print("engineers having age between 50 and 64: ",engineers4)
print("engineers having age 65 and above: ",engineers5)
print('\n')

print("teachers having age between 13 and 17: ",teachers1)
print("teachers having age between 18 and 29: ",teachers2)
print("teachers having age between 30 and 49: ",teachers3)
print("teachers having age between 50 and 64: ",teachers4)
print("teachers having age 65 and above: ",teachers5)
print('\n')

print("drivers having age between 13 and 17: ",drivers1)
print("drivers having age between 18 and 29: ",drivers2)
print("drivers having age between 30 and 49: ",drivers3)
print("drivers having age between 50 and 64: ",drivers4)
print("drivers having age 65 and above: ",drivers5)
print('\n')

print("doctors having age between 13 and 17: ",doctors1)
print("doctors having age between 18 and 29: ",doctors2)
print("doctors having age between 30 and 49: ",doctors3)
print("doctors having age between 50 and 64: ",doctors4)
print("doctors having age 65 and above: ",doctors5)
print('\n')

# Give every integer-labelled node a default community of 0, then overwrite it
# per bucket: actors 0-4, engineers 5-9, teachers 10-14, drivers 15-19,
# doctors 20-24. actors1 therefore shares community 0 with the default.
# NOTE(review): the bucket entries are 'name' strings from the CSV; indexing
# g.node[...] with a label that is not already a node presumably raises
# KeyError - confirm the names match the node labels.
# NOTE(review): g.node looks like the older NetworkX accessor (the answer's
# code uses G.nodes[...]) - confirm against the installed version.
for i in range(0,4038):
    g.node[i]['community']=0

for x1 in actors1:
    g.node[x1]['community']=0
for x2 in actors2:
    g.node[x2]['community']=1 
for x3 in actors3:
    g.node[x3]['community']=2
for x4 in actors4:
    g.node[x4]['community']=3
for x5 in actors5:
    g.node[x5]['community']=4
for x6 in engineers1:
    g.node[x6]['community']=5
for x7 in engineers2:
    g.node[x7]['community']=6
for x8 in engineers3:
    g.node[x8]['community']=7
for x9 in engineers4:
    g.node[x9]['community']=8
for x10 in engineers5:
    g.node[x10]['community']=9
for x11 in teachers1:
    g.node[x11]['community']=10
for x12 in teachers2:
    g.node[x12]['community']=11
for x13 in teachers3:
    g.node[x13]['community']=12
for x14 in teachers4:
    g.node[x14]['community']=13
for x15 in teachers5:
    g.node[x15]['community']=14
for x16 in drivers1:
    g.node[x16]['community']=15
for x17 in drivers2:
    g.node[x17]['community']=16
for x18 in drivers3:
    g.node[x18]['community']=17
for x19 in drivers4:
    g.node[x19]['community']=18
for x20 in drivers5:
    g.node[x20]['community']=19
for x21 in doctors1:
    g.node[x21]['community']=20
for x22 in doctors2:
   g.node[x22]['community']=21
for x23 in doctors3:
    g.node[x23]['community']=22
for x24 in doctors4:
    g.node[x24]['community']=23
for x25 in doctors5:
    g.node[x25]['community']=24

# Print the node labels, then materialise and print the link-prediction result.
print(g.nodes())
# The call yields its scores lazily, hence the list(); it presumably reads the
# 'community' attribute set on the nodes - confirm against the NetworkX docs.
l=list(nx.cn_soundarajan_hopcroft(g))
print(l)

Answer 1

序言

我强烈建议您读一本讲解算法的优秀编程书籍。您的问题只需几行代码就能解决。

第一幕

先看看您的问题：您有多种职业、多个年龄段，以及作为唯一标识符的多个姓名，您想把它们彼此区分开。再看看您的代码：为了解决这个问题，您为每一种“年龄段-职业”组合都单独创建了一个列表。这是所能创建的最难修改的结构。如果要再添加五种职业（而现实中的职业成千上万），您就真的得把代码翻一倍。此外，复制粘贴时非常容易出错：只要把 merchandiser4 误写成 merchandiser3，就足以让您在接下来的两个小时里盯得两眼通红。看，您的代码里已经有这样的错误了！

if (line['profession'] == 'doctor' and line['age'] >= '13' and line['age'] <= '17'):
    doctors1.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '18' and line['age'] <= '29'):
    doctors2.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '30' and line['age'] <= '49'):
    doctors3.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '50' and line['age'] <= '64'):
    # Hello, guys! I am ready to torture his brain and eyes for hours!!
    # BUG: a 'doctor' row is appended to drivers4 instead of doctors4.
    drivers4.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '65'):
    doctors5.append(line['name'])

最后再补一枪：您其实根本不需要这么多列表。比如，您可以为每种职业创建一个字典，或者用别的结构。但您可能已经注意到，每个人的数据都有着高度重复的模式：姓名、年龄、职业……等等，我们的数据是从哪里来的？CSV 文件？那 CSV 文件又是什么？

是的

表。

第二幕

如果数据是从表格里读出来的，那么最好也把它存放在表格里！（嗯，大多数时候是这样……）Python 有一个很棒的表格处理库——pandas。您那几百行代码可以缩减到十几行！现在，看好了，魔法开始……

第零步。导入 pandas：

import pandas as pd

第一步。我们为年龄分组单独创建一个函数。这样，即使大老板说我们要处理 11 岁的神经科学家，我们也完全准备好了：

def get_age_cluster(age):
    """Return the age-bracket label ('<13' ... '>64') for ``int(age)``.

    Raises ValueError for a negative age instead of returning None.
    """
    a = int(age)
    if a < 0:
        raise ValueError('age must be non-negative, got {!r}'.format(age))
    # Inclusive upper bounds in ascending order: the first match wins.
    brackets = ((12, '<13'), (17, '13-17'), (29, '18-29'), (49, '30-49'), (64, '50-64'))
    for upper, label in brackets:
        if a <= upper:
            return label
    return '>64'

第二步。读取 CSV。您在手动操作，逐行处理每一种可能的组合……为什么？！这是一项很常见的操作！早就有人替您写好了！要学会偷懒！

（这是我的老教师给我的建议，多年来我一直把它珍藏在心里！开玩笑的，我没有心。）

# Load the whole CSV into a DataFrame in a single call.
df=pd.read_csv('TF.csv')

是的，仅此而已。真的。就一行。二十四个字符（记住这个数字！）。现在，让我们和表格里的这十行小可爱交个朋友：

我们刚刚加载了 CSV，但还没有转换 age 列：它现在存的是具体年龄，而我们需要的是年龄段。没问题！

# Replace each value in the 'age' column with the label get_age_cluster returns for it.
df['age'] = df['age'].apply(get_age_cluster)

完成！您可以把任意转换函数应用到表格的行或列上。因此，我们不需要分类年龄、分类年龄、再分类年龄……只要写一行漂亮的代码就够了。结果如下：

您可能注意到表里有一些无用的列。没问题！

# Drop the columns we do not need (axis=1 selects columns rather than rows).
df = df.drop('waka', axis=1)
df = df.drop('we_dont_need_this_column', axis=1)

现在我们有了一张漂亮的小表格：

现在进入主要任务：按职业和年龄段取出所有姓名。pandas 提供了许多分组功能，我们用最简单的一种：

# Group the rows by (profession, age bracket), then print each group's key
# followed by the list of names in that group.
grouped = df.groupby(['profession', 'age'])
for group in grouped.groups:
    print(group, list(grouped.get_group(group)['name']))

我们先得到按“职业-年龄段”分组的结构：grouped = df.groupby(['profession', 'age'])；然后遍历该结构中的每个组：for group in grouped.groups:；并打印每个组中 name 列的列表：list(grouped.get_group(group)['name'])。结果如下：

('eng', '30-49') ['Cthulhu']
('driver', '18-29') ['John Doe 3']
('actor', '13-17') ['John Doe 4']
('actor', '18-29') ['Yog-Sothoth']
('teacher', '18-29') ['John Doe 2', 'Shub-Niggurath']
('eng', '>64') ['Fblthp the Lost']
('driver', '<13') ['Azathoth']
('doctor', '18-29') ['Nyarlathotep']
('doctor', '30-49') ['John Doe 1']

这是完整的代码：

import pandas as pd

def get_age_cluster(age):
    """Return the age-bracket label ('<13' ... '>64') for ``int(age)``.

    Raises ValueError for a negative age instead of returning None.
    """
    a = int(age)
    if a < 0:
        raise ValueError('age must be non-negative, got {!r}'.format(age))
    # Inclusive upper bounds in ascending order: the first match wins.
    brackets = ((12, '<13'), (17, '13-17'), (29, '18-29'), (49, '30-49'), (64, '50-64'))
    for upper, label in brackets:
        if a <= upper:
            return label
    return '>64'

df=pd.read_csv('TF.csv')  # load the whole table in one call
df['age'] = df['age'].apply(get_age_cluster)  # replace each age with its bracket label
df = df.drop('waka', axis=1)  # drop the columns we do not need
df = df.drop('we_dont_need_this_column', axis=1)
grouped = df.groupby(['profession', 'age'])  # one group per (profession, age bracket)
for group in grouped.groups:
    print(group, list(grouped.get_group(group)['name']))  # group key, then its names

二十四行。我想我们现在可以自称“神奇二十四侠”了——就像神奇四侠，只不过是二十四个。但我们的 Graph Doom（图论版“毁灭博士”）还活着……

第三幕

我们创建了表格，并做了一些转换、排序和过滤。但您还有另一个问题——图。而且这个问题比第一个更难。

您从同一个文件中读取节点（人）和边（我不清楚具体是什么关系）。这给您的图强加了一个严格的限制——节点数等于边数，而这种情况非常罕见。我认为在动手写这个脚本之前，您就已经有地方做错了。我建议您把节点和边放在不同的文件里（或者至少放在同一文件的不同部分）。不过！假设这确实是您想要的，并且每个人（当然也包括克苏鲁！）都只有一条边。在这种情况下，我们只用两行代码就能构建出图：

# Build the graph from the ('first', 'second') column pairs, one edge per table row.
G = nx.Graph()
G.add_edges_from(df[['first', 'second']].values)

搞定！图建好了。现在来算那个奇怪又复杂的东西：

设置每个节点的社区（请注意，算法需要它）：

# Put every node in the same community (0) so that each node carries the attribute.
for n in G.nodes:
    G.nodes[n]['community'] = 0

然后进行计算：

# The result is an iterator; wrap it in list() to see the (u, v, score) triples.
csh = nx.cn_soundarajan_hopcroft(G)

我们得到一个迭代器。将其转换为列表并获取结果：

[(1, 8, 2),
 (1, 9, 0),
 (1, 2, 4),
 (1, 4, 0),
 (1, 6, 2),
 (2, 8, 2),
 (2, 9, 2),
 (2, 5, 0),
 (2, 6, 2),
 (3, 9, 0),
 (3, 4, 2),
 (3, 5, 2),
 (3, 6, 0),
 (3, 7, 4),
 (4, 8, 0),
 (4, 5, 2),
 (4, 7, 2),
 (5, 8, 0),
 (5, 9, 0),
 (5, 7, 2),
 (6, 8, 0),
 (6, 9, 2),
 (6, 7, 0),
 (7, 8, 0),
 (7, 9, 0),
 (8, 9, 0)]

终章

希望您喜欢我为您写的这出小音乐剧 :) 我建议您去读一些优秀的 Python 编程书籍和算法书籍。祝您好运！

使用python中的networkx创建图并执行链接预测时出错

1 个答案: