使用 Python Pandas 检查国家/地区名称是否有效

时间:2016-08-25 12:19:28

标签: python pandas

我有一列包含国家/地区名称的数据。我想知道,有没有办法检查该列是否包含无效的国家/地区名称,并把它们打印出来?

             Column1
             Hong Kong
             United States of America
             Malaysia 
             Thailand
             Canada
             Indi 
             Koreai
             Japan
             Australia

我期望的结果是:

             Column1 
             Indi 
             Koreai

2 个答案:

答案 0 :(得分:1)

你可以先从 pycountry 这个模块入手,但它并不准确!它包含 249 个国家,而联合国目前列出的是 241 个国家。

这是我目前使用的代码:

class nation(object):
    """Read-only record for one row of the UN country/area table.

    :param un_code: numeric UN code; anything ``int()`` accepts, e.g. ``"512"``
    :param un_name: country/area name, stored as given
    :param un_3_str: three-letter code, stored as given
    :raises ValueError: if ``un_code`` cannot be converted with ``int()``
    """

    def __init__(self, un_code, un_name, un_3_str):
        # The numeric code is normalised to int once, so ``numeric`` always
        # yields an int even when the caller passed a string such as "004".
        self._code = int(un_code)
        self._name = un_name
        self._code3 = un_3_str

    def __repr__(self):
        # Debug-friendly representation instead of the default object address.
        return "nation(%r, %r, %r)" % (self._code, self._name, self._code3)

    @property
    def numeric(self):
        """The numeric code as an ``int``."""
        return self._code

    @property
    def name(self):
        """The country/area name."""
        return self._name

    @property
    def code(self):
        """The three-letter code."""
        return self._code3


class UN_db(object):
    """In-memory lookup table of country/area codes.

    ``self.UN`` is a list of ``(numeric code, name, three-letter code)``
    tuples; all three items are strings. The final row,
    ``("000", "000", "UNK")``, is a sentinel that the lookup methods return
    when nothing matches.

    Note that two rows carry no usable three-letter code: "Channel Islands"
    has ``""`` and "Sark" has ``" "``.
    """

    def __init__(self):
        self.UN = [("4", "Afghanistan", "AFG"),
                   ("248", "Aland Islands", "ALA"),
                   ("8", "Albania", "ALB"),
                   ("12", "Algeria", "DZA"),
                   ("16", "American Samoa", "ASM"),
                   ("20", "Andorra", "AND"),
                   ("24", "Angola", "AGO"),
                   ("660", "Anguilla", "AIA"),
                   ("28", "Antigua and Barbuda", "ATG"),
                   ("32", "Argentina", "ARG"),
                   ("51", "Armenia", "ARM"),
                   ("533", "Aruba", "ABW"),
                   ("36", "Australia", "AUS"),
                   ("40", "Austria", "AUT"),
                   ("31", "Azerbaijan", "AZE"),
                   ("44", "Bahamas", "BHS"),
                   ("48", "Bahrain", "BHR"),
                   ("50", "Bangladesh", "BGD"),
                   ("52", "Barbados", "BRB"),
                   ("112", "Belarus", "BLR"),
                   ("56", "Belgium", "BEL"),
                   ("84", "Belize", "BLZ"),
                   ("204", "Benin", "BEN"),
                   ("60", "Bermuda", "BMU"),
                   ("64", "Bhutan", "BTN"),
                   ("68", "Bolivia (Plurinational State of)", "BOL"),
                   ("535", "Bonaire, Sint Eustatius and Saba", "BES"),
                   ("70", "Bosnia and Herzegovina", "BIH"),
                   ("72", "Botswana", "BWA"),
                   ("76", "Brazil", "BRA"),
                   ("92", "British Virgin Islands", "VGB"),
                   ("96", "Brunei Darussalam", "BRN"),
                   ("100", "Bulgaria", "BGR"),
                   ("854", "Burkina Faso", "BFA"),
                   ("108", "Burundi", "BDI"),
                   ("132", "Cabo Verde", "CPV"),
                   ("116", "Cambodia", "KHM"),
                   ("120", "Cameroon", "CMR"),
                   ("124", "Canada", "CAN"),
                   ("136", "Cayman Islands", "CYM"),
                   ("140", "Central African Republic", "CAF"),
                   ("148", "Chad", "TCD"),
                   ("830", "Channel Islands", ""),
                   ("152", "Chile", "CHL"),
                   ("156", "China", "CHN"),
                   ("344", "China, Hong Kong Special Administrative Region", "HKG"),
                   ("446", "China, Macao Special Administrative Region", "MAC"),
                   ("170", "Colombia", "COL"),
                   ("174", "Comoros", "COM"),
                   ("178", "Congo", "COG"),
                   ("184", "Cook Islands", "COK"),
                   ("188", "Costa Rica", "CRI"),
                   ("384", "Cote d'Ivoire", "CIV"),
                   ("191", "Croatia", "HRV"),
                   ("192", "Cuba", "CUB"),
                   ("531", "Curacao", "CUW"),
                   ("196", "Cyprus", "CYP"),
                   ("203", "Czech Republic", "CZE"),
                   ("408", "Democratic People's Republic of Korea", "PRK"),
                   ("180", "Democratic Republic of the Congo", "COD"),
                   ("208", "Denmark", "DNK"),
                   ("262", "Djibouti", "DJI"),
                   ("212", "Dominica", "DMA"),
                   ("214", "Dominican Republic", "DOM"),
                   ("218", "Ecuador", "ECU"),
                   ("818", "Egypt", "EGY"),
                   ("222", "El Salvador", "SLV"),
                   ("226", "Equatorial Guinea", "GNQ"),
                   ("232", "Eritrea", "ERI"),
                   ("233", "Estonia", "EST"),
                   ("231", "Ethiopia", "ETH"),
                   ("234", "Faeroe Islands", "FRO"),
                   ("238", "Falkland Islands (Malvinas)", "FLK"),
                   ("242", "Fiji", "FJI"),
                   ("246", "Finland", "FIN"),
                   ("250", "France", "FRA"),
                   ("254", "French Guiana", "GUF"),
                   ("258", "French Polynesia", "PYF"),
                   ("266", "Gabon", "GAB"),
                   ("270", "Gambia", "GMB"),
                   ("268", "Georgia", "GEO"),
                   ("276", "Germany", "DEU"),
                   ("288", "Ghana", "GHA"),
                   ("292", "Gibraltar", "GIB"),
                   ("300", "Greece", "GRC"),
                   ("304", "Greenland", "GRL"),
                   ("308", "Grenada", "GRD"),
                   ("312", "Guadeloupe", "GLP"),
                   ("316", "Guam", "GUM"),
                   ("320", "Guatemala", "GTM"),
                   ("831", "Guernsey", "GGY"),
                   ("324", "Guinea", "GIN"),
                   ("624", "Guinea-Bissau", "GNB"),
                   ("328", "Guyana", "GUY"),
                   ("332", "Haiti", "HTI"),
                   ("336", "Holy See", "VAT"),
                   ("340", "Honduras", "HND"),
                   ("348", "Hungary", "HUN"),
                   ("352", "Iceland", "ISL"),
                   ("356", "India", "IND"),
                   ("360", "Indonesia", "IDN"),
                   ("364", "Iran (Islamic Republic of)", "IRN"),
                   ("368", "Iraq", "IRQ"),
                   ("372", "Ireland", "IRL"),
                   ("833", "Isle of Man", "IMN"),
                   ("376", "Israel", "ISR"),
                   ("380", "Italy", "ITA"),
                   ("388", "Jamaica", "JAM"),
                   ("392", "Japan", "JPN"),
                   ("832", "Jersey", "JEY"),
                   ("400", "Jordan", "JOR"),
                   ("398", "Kazakhstan", "KAZ"),
                   ("404", "Kenya", "KEN"),
                   ("296", "Kiribati", "KIR"),
                   ("414", "Kuwait", "KWT"),
                   ("417", "Kyrgyzstan", "KGZ"),
                   ("418", "Lao People's Democratic Republic", "LAO"),
                   ("428", "Latvia", "LVA"),
                   ("422", "Lebanon", "LBN"),
                   ("426", "Lesotho", "LSO"),
                   ("430", "Liberia", "LBR"),
                   ("434", "Libya", "LBY"),
                   ("438", "Liechtenstein", "LIE"),
                   ("440", "Lithuania", "LTU"),
                   ("442", "Luxembourg", "LUX"),
                   ("450", "Madagascar", "MDG"),
                   ("454", "Malawi", "MWI"),
                   ("458", "Malaysia", "MYS"),
                   ("462", "Maldives", "MDV"),
                   ("466", "Mali", "MLI"),
                   ("470", "Malta", "MLT"),
                   ("584", "Marshall Islands", "MHL"),
                   ("474", "Martinique", "MTQ"),
                   ("478", "Mauritania", "MRT"),
                   ("480", "Mauritius", "MUS"),
                   ("175", "Mayotte", "MYT"),
                   ("484", "Mexico", "MEX"),
                   ("583", "Micronesia (Federated States of)", "FSM"),
                   ("492", "Monaco", "MCO"),
                   ("496", "Mongolia", "MNG"),
                   ("499", "Montenegro", "MNE"),
                   ("500", "Montserrat", "MSR"),
                   ("504", "Morocco", "MAR"),
                   ("508", "Mozambique", "MOZ"),
                   ("104", "Myanmar", "MMR"),
                   ("516", "Namibia", "NAM"),
                   ("520", "Nauru", "NRU"),
                   ("524", "Nepal", "NPL"),
                   ("528", "Netherlands", "NLD"),
                   ("540", "New Caledonia", "NCL"),
                   ("554", "New Zealand", "NZL"),
                   ("558", "Nicaragua", "NIC"),
                   ("562", "Niger", "NER"),
                   ("566", "Nigeria", "NGA"),
                   ("570", "Niue", "NIU"),
                   ("574", "Norfolk Island", "NFK"),
                   ("580", "Northern Mariana Islands", "MNP"),
                   ("578", "Norway", "NOR"),
                   ("512", "Oman", "OMN"),
                   ("586", "Pakistan", "PAK"),
                   ("585", "Palau", "PLW"),
                   ("591", "Panama", "PAN"),
                   ("598", "Papua New Guinea", "PNG"),
                   ("600", "Paraguay", "PRY"),
                   ("604", "Peru", "PER"),
                   ("608", "Philippines", "PHL"),
                   ("612", "Pitcairn", "PCN"),
                   ("616", "Poland", "POL"),
                   ("620", "Portugal", "PRT"),
                   ("630", "Puerto Rico", "PRI"),
                   ("634", "Qatar", "QAT"),
                   ("410", "Republic of Korea", "KOR"),
                   ("498", "Republic of Moldova", "MDA"),
                   ("638", "Réunion", "REU"),
                   ("642", "Romania", "ROU"),
                   ("643", "Russian Federation", "RUS"),
                   ("646", "Rwanda", "RWA"),
                   ("652", "Saint Barthélemy", "BLM"),
                   ("654", "Saint Helena", "SHN"),
                   ("659", "Saint Kitts and Nevis", "KNA"),
                   ("662", "Saint Lucia", "LCA"),
                   ("663", "Saint Martin (French part)", "MAF"),
                   ("666", "Saint Pierre and Miquelon", "SPM"),
                   ("670", "Saint Vincent and the Grenadines", "VCT"),
                   ("882", "Samoa", "WSM"),
                   ("674", "San Marino", "SMR"),
                   ("678", "Sao Tome and Principe", "STP"),
                   ("680", "Sark", " "),
                   ("682", "Saudi Arabia", "SAU"),
                   ("686", "Senegal", "SEN"),
                   ("688", "Serbia", "SRB"),
                   ("690", "Seychelles", "SYC"),
                   ("694", "Sierra Leone", "SLE"),
                   ("702", "Singapore", "SGP"),
                   ("534", "Sint Maarten (Dutch part)", "SXM"),
                   ("703", "Slovakia", "SVK"),
                   ("705", "Slovenia", "SVN"),
                   ("90", "Solomon Islands", "SLB"),
                   ("706", "Somalia", "SOM"),
                   ("710", "South Africa", "ZAF"),
                   ("728", "South Sudan", "SSD"),
                   ("724", "Spain", "ESP"),
                   ("144", "Sri Lanka", "LKA"),
                   ("275", "State of Palestine", "PSE"),
                   ("729", "Sudan", "SDN"),
                   ("740", "Suriname", "SUR"),
                   ("744", "Svalbard and Jan Mayen Islands", "SJM"),
                   ("748", "Swaziland", "SWZ"),
                   ("752", "Sweden", "SWE"),
                   ("756", "Switzerland", "CHE"),
                   ("760", "Syrian Arab Republic", "SYR"),
                   ("762", "Tajikistan", "TJK"),
                   ("764", "Thailand", "THA"),
                   ("807", "The former Yugoslav Republic of Macedonia", "MKD"),
                   ("626", "Timor-Leste", "TLS"),
                   ("768", "Togo", "TGO"),
                   ("772", "Tokelau", "TKL"),
                   ("776", "Tonga", "TON"),
                   ("780", "Trinidad and Tobago", "TTO"),
                   ("788", "Tunisia", "TUN"),
                   ("792", "Turkey", "TUR"),
                   ("795", "Turkmenistan", "TKM"),
                   ("796", "Turks and Caicos Islands", "TCA"),
                   ("798", "Tuvalu", "TUV"),
                   ("800", "Uganda", "UGA"),
                   ("804", "Ukraine", "UKR"),
                   ("784", "United Arab Emirates", "ARE"),
                   ("826", "United Kingdom of Great Britain and Northern Ireland", "GBR"),
                   ("834", "United Republic of Tanzania", "TZA"),
                   ("840", "United States of America", "USA"),
                   ("850", "United States Virgin Islands", "VIR"),
                   ("858", "Uruguay", "URY"),
                   ("860", "Uzbekistan", "UZB"),
                   ("548", "Vanuatu", "VUT"),
                   ("862", "Venezuela (Bolivarian Republic of)", "VEN"),
                   ("704", "Viet Nam", "VNM"),
                   ("876", "Wallis and Futuna Islands", "WLF"),
                   ("732", "Western Sahara", "ESH"),
                   ("887", "Yemen", "YEM"),
                   ("894", "Zambia", "ZMB"),
                   ("716", "Zimbabwe", "ZWE"),
                   ("000", "000", "UNK")]

    @property
    def count(self):
        """Number of rows in the table, including the trailing sentinel row."""
        return len(self.UN)

    @property
    def UN_Codes(self):
        """Numeric codes of every row as a list of ints, in table order."""
        return [int(row[0]) for row in self.UN]

    @property
    def Str_Codes(self):
        """Three-letter codes of every row as a list of strings, in table order."""
        return [row[2] for row in self.UN]

    def _find_row(self, column, wanted, convert=str):
        """Return the first row whose ``convert(row[column])`` equals ``wanted``.

        Falls back to the last row of ``self.UN`` (the "UNK" sentinel) when no
        row matches, so callers always receive a 3-tuple.
        """
        for row in self.UN:
            if convert(row[column]) == wanted:
                return row
        return self.UN[-1]

    def getby_int(self, un_code_as_int):
        """
        Look up an entry by its numeric code, e.g. Oman is 512.

        :param un_code_as_int: numeric code as an int
        :return: ``nation`` for the matching row, or a ``nation`` built from
            the ``("000", "000", "UNK")`` sentinel row when nothing matches
        """
        # Column 0 holds the numeric code as a string, so it is compared as int.
        return nation(*self._find_row(0, un_code_as_int, int))

    def getby_code(self, un_code_as_code):
        """
        Look up an entry by its three-letter code, e.g. Oman is OMN.

        The input is upper-cased and stripped of surrounding whitespace
        before matching.

        :param un_code_as_code: three-letter code as a string
        :return: ``nation`` for the matching row, or a ``nation`` built from
            the ``("000", "000", "UNK")`` sentinel row when nothing matches
        """
        wanted = un_code_as_code.upper().strip()

        # An empty lookup must not match the "Channel Islands" row, whose
        # three-letter code is the empty string.
        if not wanted:
            return nation(*self.UN[-1])

        # Column 2 holds the three-letter code (column 0 is the numeric code).
        return nation(*self._find_row(2, wanted))

答案 1 :(得分:0)

您可以使用 DataPrep 库中的函数 validate_country()。使用 pip install dataprep 安装。

validate_country() 如果值为有效国家/地区则返回 True,否则返回 False。

import pandas as pd
from dataprep.clean import validate_country

# Sample column mixing real names with two misspellings ("Indi", "Koreai").
df = pd.DataFrame({"column1": ["Hong Kong", "United States of America",
     "Malaysia", "Thailand", "Canada", "Indi", "Koreai", "Japan", "Australia"]})
# One boolean per row; the output below shows False for the two misspellings.
srs = validate_country(df["column1"])
srs
0     True
1     True
2     True
3     True
4     True
5    False
6    False
7     True
8     True
Name: column1, dtype: bool

因此要仅打印无效的国家/地区,您可以使用 validate_country() 的输出为 DataFrame 建立索引:

# Negate the validity mask so that only the rows that failed validation remain.
df["column1"][~validate_country(df["column1"])]
5      Indi
6    Koreai
Name: column1, dtype: object