Question

这是我第一次尝试使用 spaCy。我有如下格式的临时训练数据。

[
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"Michael",
                "tag":"-",
                "ner":"U-PER"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"Irwin",
                "tag":"-",
                "ner":"U-PER"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"Jordan",
                "tag":"-",
                "ner":"U-PER"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"is",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"an",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"American",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"scientist",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"Professor",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"at",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"the",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"University",
                "tag":"-",
                "ner":"U-ORG"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"of",
                "tag":"-",
                "ner":"U-ORG"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"California",
                "tag":"-",
                "ner":"U-ORG"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"Berkeley",
                "tag":"-",
                "ner":"U-LOC"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"and",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"a",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"researcher",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"in",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"machine",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"learning",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"statistics",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"and",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"artificial",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"intelligence",
                "tag":"-",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "id":0,
    "paragraphs":[
      {
        "sentences":[
          {
            "tokens":[
              {
                "orth":"",
                "tag":"",
                "ner":"O"
              }
            ]
          }
        ]
      }
    ]
  }
]

我所见过的所有训练 spaCy 模型（https://spacy.io/usage/training#spacy-train-cli）的示例，使用的都是以下类型的输入

能否请哪位提供一个示例，说明如何使用第一种形式的输入来训练 spaCy？

Answer 1

我最近更新了 IOB / NER 转换器，并创建了一组 `spacy convert -c iob` 可接受的示例输入，以及以这种格式输出的相应训练数据：

https://github.com/explosion/spaCy/tree/8ebc3711dc1ec065c39aeb6017d9ace129a28d3f/examples/training/ner_example_data

更新的转换器将在下一个版本中发布，但是如果您想尽快尝试，则可以从源代码安装master分支。

使用自定义输入来训练 spaCy 模型

1 个答案: