How can I find duplicates in a JSON file after parsing it out like the code below? I want to count the number of duplicates in the data where a duplicate would have the first name, last name, and email address all match.
The JSON file is rather huge, so I won't copy and paste it here. But here is a snippet of it:
[
{
"firstName":"Cletus",
"lastName":"Defosses",
"emailAddress":"ea4ad81f-4111-4d8d-8738-ecf857bba992.Defosses@somedomain.org"
},
{
"firstName":"Sherron",
"lastName":"Siverd",
"emailAddress":"51c985c5-381d-4d0e-b5ee-83005f39ce17.Siverd@somedomain.org"
},
{
"firstName":"Garry",
"lastName":"Eirls",
"emailAddress":"cc43c2da-d12c-467f-9318-beb3379f6509.Eirls@somedomain.org"
}]
This is the main.cpp file:
#include <iostream>
#include <string>
#include "Customer.h"
// Forward slashes only: a backslash inside a header name is not portable
// and fails to resolve on non-Windows toolchains.
#include "boost/property_tree/ptree.hpp"
#include "boost/property_tree/json_parser.hpp"
#include "boost/foreach.hpp"
using namespace std;
/// Reads customers.json (a top-level JSON array of objects) and prints every
/// key/value pair of every object, with a blank line after each object.
///
/// @return 0 once the whole file has been printed.
/// NOTE(review): read_json reports a missing or malformed file by throwing;
/// nothing here catches that, so such a failure terminates the program.
int main()
{
    // Announce the work before it starts, so the message is visible even if
    // parsing fails.
    cout << "Reading file..." << endl;
    // parse the JSON file
    boost::property_tree::ptree file;
    boost::property_tree::read_json("customers.json", file);
    // iterate over each top level entry
    BOOST_FOREACH(boost::property_tree::ptree::value_type const& rowPair, file.get_child(""))
    {
        // rowPair.first == "" (array elements have no key) and rowPair.second
        // is the subtree with names and emails
        BOOST_FOREACH(boost::property_tree::ptree::value_type const& itemPair, rowPair.second)
        {
            // e.g. itemPair.first == "firstName" or "lastName"; the ": "
            // separator is added here, it is not part of the key
            cout << itemPair.first << ": ";
            // itemPair.second holds the actual name or email as its value
            cout << itemPair.second.get_value<std::string>() << endl;
        }
        cout << endl;
    }
    cout << endl;
    return 0;
}
The Customer class is just a generic class.
/// Plain value type holding one customer record: first name, last name and
/// e-mail address.
///
/// The getters are const and return references to the stored strings, so they
/// can be called on a `Customer const&` and used to build a key of references
/// without copying. A returned reference stays valid only as long as the
/// Customer object does and the corresponding setter is not called.
class Customer
{
private:
    std::string m_firstName;     // spelling fixed (was m_firstNme)
    std::string m_lastName;
    std::string m_emailAddress;
public:
    /// @return the stored first name (empty until set).
    std::string const& getFirstName() const { return m_firstName; }
    /// Replaces the stored first name.
    void setFirstName(std::string firstName) { m_firstName = firstName; }

    /// @return the stored last name (empty until set).
    std::string const& getLastName() const { return m_lastName; }
    /// Replaces the stored last name.
    void setLastName(std::string lastName) { m_lastName = lastName; }

    /// @return the stored e-mail address (empty until set).
    std::string const& getEmailAddress() const { return m_emailAddress; }
    /// Replaces the stored e-mail address.
    void setEmailAddress(std::string emailAddress) { m_emailAddress = emailAddress; }
};
Answer 0 (score: 0)
You'd typically insert the customer objects (or their keys) into a std::set or std::map and define an ordering on the key (first name, last name, e-mail address), so that a duplicate is detected at the moment you try to insert it.
Defining the key function and comparator object:
// Builds the identity key of a customer: (first name, last name, e-mail
// address). The tuple holds references, not copies, so it must not outlive
// `c`, and the getters have to return `std::string const&`.
boost::tuple<string const&, string const&, string const&> key_of(Customer const& c) {
    typedef boost::tuple<string const&, string const&, string const&> key_type;
    return key_type(c.getFirstName(), c.getLastName(), c.getEmailAddress());
}
struct by_key {
bool operator()(Customer const& a, Customer const& b) const {
return key_of(a) < key_of(b);
}
};
Now you can simply insert the objects into a set<Customer, by_key>:
// Customers seen so far; by_key treats entries with the same first name,
// last name and e-mail address as equivalent.
set<Customer, by_key> unique;
// Visit every element of the top-level JSON array.
BOOST_FOREACH(boost::property_tree::ptree::value_type const& entry, file.get_child(""))
{
    boost::property_tree::ptree const& fields = entry.second;
    // A missing field falls back to "?".
    Customer candidate;
    candidate.setFirstName(fields.get("firstName", "?"));
    candidate.setLastName(fields.get("lastName", "?"));
    candidate.setEmailAddress(fields.get("emailAddress", "?"));
    // insert() reports through .second whether the element was really added;
    // false means an equivalent customer was already in the set.
    bool const inserted = unique.insert(candidate).second;
    if (!inserted)
        cout << "(duplicate skipped)\n";
    else
        cout << candidate << "\n";
}
I've duplicated one entry in your sample JSON; you can see it running live with this complete program:
#include <iostream>
#include <string>
#include <set>
#include "Customer.h"
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/foreach.hpp>
#include <boost/tuple/tuple_comparison.hpp>
using namespace std;
namespace {
    // Builds the identity key of a customer: (first name, last name, e-mail
    // address). The tuple holds references, not copies, so it must not
    // outlive `c`, and the getters have to return `std::string const&`.
    boost::tuple<string const&, string const&, string const&> key_of(Customer const& c) {
        return boost::tie(c.getFirstName(), c.getLastName(), c.getEmailAddress());
    }

    // Comparator for ordered containers: orders customers by their key_of()
    // tuple, so two customers whose three key fields match compare equivalent
    // and the second one is rejected by set::insert.
    struct by_key {
        bool operator()(Customer const& a, Customer const& b) const {
            return key_of(a) < key_of(b);
        }
    };

    // Writes a customer as { 'first', 'last', 'email' } and returns the
    // stream. Every field is wrapped in single quotes; the closing quote
    // after the e-mail address used to be missing.
    inline ostream& operator<<(ostream& os, Customer const& c) {
        return os << "{ '"
                  << c.getFirstName() << "', '"
                  << c.getLastName() << "', '"
                  << c.getEmailAddress() << "' }";
    }
}
int main()
{
// parse the JSON file
boost::property_tree::ptree file;
boost::property_tree::read_json("customers.json", file);
cout << "Reading file..." << endl;
set<Customer, by_key> unique;
// iterate over each top level array
BOOST_FOREACH(boost::property_tree::ptree::value_type const& rowPair, file.get_child(""))
{
Customer current;
current.setFirstName ( rowPair.second.get ( "firstName", "?" ) ) ;
current.setLastName ( rowPair.second.get ( "lastName", "?" ) ) ;
current.setEmailAddress ( rowPair.second.get ( "emailAddress", "?" ) ) ;
if (unique.insert(current).second)
cout << current << "\n";
else
cout << "(duplicate skipped)\n";
}
cout << "\n" << (file.size() - unique.size()) << " duplicates were found\n";
}
Prints:
Reading file...
{ 'Sherron', 'Siverd', '51c985c5-381d-4d0e-b5ee-83005f39ce17.Siverd@somedomain.org }
{ 'Cletus', 'Defosses', 'ea4ad81f-4111-4d8d-8738-ecf857bba992.Defosses@somedomain.org }
(duplicate skipped)
{ 'Garry', 'Eirls', 'cc43c2da-d12c-467f-9318-beb3379f6509.Eirls@somedomain.org }
1 duplicates were found
NOTE: I've adjusted the getters to be less wasteful by returning `const&`:
std::string const& getFirstName() const { return m_firstName; }
std::string const& getLastName() const { return m_lastName; }
std::string const& getEmailAddress() const { return m_emailAddress; }
Here's the equivalent program in 26 lines of c++14 code: