这是我第一次尝试使用 spaCy。我有以下形式的临时训练数据:
[
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"Michael",
"tag":"-",
"ner":"U-PER"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"Irwin",
"tag":"-",
"ner":"U-PER"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"Jordan",
"tag":"-",
"ner":"U-PER"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"is",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"an",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"American",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"scientist",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"Professor",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"at",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"the",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"University",
"tag":"-",
"ner":"U-ORG"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"of",
"tag":"-",
"ner":"U-ORG"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"California",
"tag":"-",
"ner":"U-ORG"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"Berkeley",
"tag":"-",
"ner":"U-LOC"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"and",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"a",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"researcher",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"in",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"machine",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"learning",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"statistics",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"and",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"artificial",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"intelligence",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
},
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"",
"tag":"",
"ner":"O"
}
]
}
]
}
]
}
]
我见过的所有训练 spaCy 模型的示例(https://spacy.io/usage/training#spacy-train-cli)使用的都是以下类型的输入:
能否有人给出一个使用第一种形式的输入进行训练的示例?
答案 0 :(得分:1)
我最近更新了 IOB/NER 转换器,并整理了一组 `spacy convert -c iob` 可以接受的示例输入,
以及按这种格式输出的相应训练数据:
更新后的转换器将在下一个版本中发布;如果您想尽早试用,可以从源代码安装 master 分支。