如何让Google DLP V2 API以与V2Beta相同的准确度返回数据?

时间:2018-05-19 00:15:09

标签: google-cloud-dlp

我已将我的Java应用程序从API的V2Beta版本移植到V2,我的结果似乎不如V2Beta版本那么“准确”。

姓名、地址、邮政编码、年龄等信息根本没有被去标识化(de-identify)。我在V2 API中得到的结果与使用V2Beta API时的结果差别很大。也许是我哪里做错了?给定输入"Hello Mr. John S. Smith! This is Mr. Jones writing back with my SSN: 911-87-9111",唯一被去标识化的只有SSN号码。我原本预期这些姓名也会被屏蔽。

我正在使用Spring注入诸如凭证之类的内容,并且有一些Lombok注释可以简化我的生活,但是大部分代码应该非常简单:

import com.google.api.gax.core.CredentialsProvider;
import com.google.cloud.ProjectName;
import com.google.cloud.dlp.v2.DlpServiceClient;
import com.google.cloud.dlp.v2.DlpServiceSettings;
import com.google.privacy.dlp.v2.CharacterMaskConfig;
import com.google.privacy.dlp.v2.ContentItem;
import com.google.privacy.dlp.v2.DeidentifyConfig;
import com.google.privacy.dlp.v2.DeidentifyContentRequest;
import com.google.privacy.dlp.v2.DeidentifyContentResponse;
import com.google.privacy.dlp.v2.FieldId;
import com.google.privacy.dlp.v2.InfoTypeTransformations;
import com.google.privacy.dlp.v2.InfoTypeTransformations.InfoTypeTransformation;
import com.google.privacy.dlp.v2.PrimitiveTransformation;
import com.google.privacy.dlp.v2.Table;
import com.google.privacy.dlp.v2.Table.Row;
import com.google.privacy.dlp.v2.Value;
import lombok.AccessLevel;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.util.Collection;
import java.util.LinkedList;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static org.springframework.util.CollectionUtils.isEmpty;

/**
 * Spring service that masks sensitive text through the Google Cloud DLP V2
 * {@code deidentifyContent} call.
 *
 * <p>Every non-blank input string is sent as one single-column table row, and the
 * string values of the returned table are handed back to the caller.
 *
 * <p>NOTE(review): the request built here carries no {@code InspectConfig}. According
 * to the DLP V2 documentation there is no longer a default detector list, so the
 * InfoTypes to look for (e.g. {@code PERSON_NAME}) presumably have to be listed
 * explicitly for names, addresses, ages etc. to be masked — confirm and add the
 * InfoTypes this application needs.
 */
@Service("DeIdentifyTest")
@FieldDefaults(level = AccessLevel.PRIVATE)
@Setter
@Slf4j
public class DeIdentifyTest {
    final DlpServiceSettings settings;
    final String projectId;

    /**
     * Builds the DLP client settings from the supplied credentials.
     *
     * <p>{@code @SneakyThrows} lets any checked exception raised while building the
     * settings propagate without being declared.
     *
     * @param credentialsProvider provider of the credentials used for DLP calls
     * @param projectId           id of the project used as the request parent
     */
    @SneakyThrows
    public DeIdentifyTest(CredentialsProvider credentialsProvider, String projectId) {
        this.settings = DlpServiceSettings.newBuilder().setCredentialsProvider(credentialsProvider).build();
        this.projectId = projectId;
    }

    /**
     * Asynchronously masks the findings in the given strings.
     *
     * <p>The work is submitted with {@link CompletableFuture#supplyAsync(java.util.function.Supplier)},
     * i.e. on the default asynchronous executor; any failure completes the future
     * exceptionally.
     *
     * @param input strings to de-identify; blank or {@code null} items are dropped
     * @param mask  masking character handed to the DLP character-mask transformation
     * @return future holding the de-identified strings; the input itself when it is
     *         {@code null} or empty, and an empty collection when no item is non-blank
     */
    public CompletableFuture<Collection<String>> redact(final Collection<String> input,
                                                            final String mask) {
        return CompletableFuture.supplyAsync(() -> redactContent(input, mask));
    }

    /**
     * Performs the blocking de-identification call.
     *
     * @param input strings to de-identify
     * @param mask  masking character
     * @return the de-identified strings, in the order returned by the service
     */
    @SneakyThrows
    private Collection<String> redactContent(Collection<String> input, String mask) {
        log.debug("Input: {}", input);

        if (isEmpty(input)) {
            return input;
        }

        Table inputTable = createTable(input);
        if (inputTable.getRowsCount() == 0) {
            // Every item was blank: there is nothing to send, so skip the remote call
            // instead of failing on an empty Optional as the previous code did.
            return new LinkedList<>();
        }

        CharacterMaskConfig characterMaskConfig =
                CharacterMaskConfig.newBuilder().setMaskingCharacter(mask).build();

        PrimitiveTransformation primitiveTransformation =
                PrimitiveTransformation.newBuilder().setCharacterMaskConfig(characterMaskConfig).build();

        // No InfoTypes are set on the transformation, so the same masking is configured
        // for whatever the service detects.
        InfoTypeTransformation infoTypeTransformationObject =
                InfoTypeTransformation.newBuilder().setPrimitiveTransformation(primitiveTransformation).build();

        InfoTypeTransformations infoTypeTransformationArray =
                InfoTypeTransformations.newBuilder().addTransformations(infoTypeTransformationObject).build();

        DeidentifyConfig deidentifyConfig =
                DeidentifyConfig.newBuilder().setInfoTypeTransformations(infoTypeTransformationArray).build();

        // A client is created per call and closed by try-with-resources.
        try (DlpServiceClient dlpClient = DlpServiceClient.create(settings)) {
            // Create the deidentification request object
            DeidentifyContentRequest request =
                    DeidentifyContentRequest.newBuilder()
                            .setParent(ProjectName.of(projectId).toString())
                            .setDeidentifyConfig(deidentifyConfig)
                            .setItem(ContentItem.newBuilder().setTable(inputTable).build())
                            .build();

            // Execute the deidentification request
            DeidentifyContentResponse response = dlpClient.deidentifyContent(request);
            Table table = response.getItem().getTable();

            return table.getRowsList().stream()
                    .flatMap(row -> row.getValuesList().stream())
                    .map(Value::getStringValue)
                    .collect(Collectors.toCollection(LinkedList::new));
        }
    }

    /**
     * Converts the input strings into a single-column DLP table.
     *
     * @param input strings to convert; blank and {@code null} items are skipped
     * @return table with one header and one row per non-blank item (possibly no rows)
     */
    private Table createTable(Collection<String> input) {
        // The header name is a placeholder; only the row values are read back.
        Table.Builder tableBuilder = Table.newBuilder().addHeaders(FieldId.newBuilder().setName("unused").build());

        for (String item : input) {
            if (isNotBlank(item)) {
                Value value = Value.newBuilder().setStringValue(item).build();
                tableBuilder.addRows(Row.newBuilder().addValues(value).build());
            }
        }

        return tableBuilder.build();
    }
}

1 个答案:

答案 0 :(得分:0)

您的示例没有显示您选择检测哪些InfoType。V2中的主要变化是不再有默认的检测器列表,您必须明确指定要查找的内容。

有关整个列表,请参阅https://cloud.google.com/dlp/docs/infotypes-reference

如果我发送这个请求:

 {
 "item": {
  "value": "Hello Mr. John S. Smith! This is Mr. Jones writing back with my SSN: 509-03-2530"
 },
 "inspectConfig": {
  "includeQuote": true,
  "infoTypes": [
   {
    "name": "PERSON_NAME"
   },
   {
    "name": "US_SOCIAL_SECURITY_NUMBER"
   }
  ]
 }
}

我得到的响应是:

{
 "result": {
  "findings": [
   {
    "quote": "Mr. John S. Smith",
    "infoType": {
     "name": "PERSON_NAME"
    },
    "likelihood": "LIKELY",
    "location": {
     "byteRange": {
      "start": "6",
      "end": "23"
     },
     "codepointRange": {
      "start": "6",
      "end": "23"
     }
    },
    "createTime": "2018-05-21T16:11:54.449Z"
   },
   {
    "quote": "Jones",
    "infoType": {
     "name": "PERSON_NAME"
    },
    "likelihood": "POSSIBLE",
    "location": {
     "byteRange": {
      "start": "37",
      "end": "42"
     },
     "codepointRange": {
      "start": "37",
      "end": "42"
     }
    },
    "createTime": "2018-05-21T16:11:54.449Z"
   },
   {
    "quote": "509-03-2530",
    "infoType": {
     "name": "US_SOCIAL_SECURITY_NUMBER"
    },
    "likelihood": "LIKELY",
    "location": {
     "byteRange": {
      "start": "69",
      "end": "80"
     },
     "codepointRange": {
      "start": "69",
      "end": "80"
     }
    },
    "createTime": "2018-05-21T16:11:54.425Z"
   }
  ]
 }
}