我有一个包含国家/地区名称的列。我想知道,有没有办法检查该列是否包含无效的国家/地区名称,并把它们打印出来?
Column1
Hong Kong
United States of America
Malaysia
Thailand
Canada
Indi
Koreai
Japan
Australia
我期望的结果是
Column1
Indi
Koreai
答案 0 :(得分:1)
你可以先试试 pycountry 这个模块——但它并不准确!它包含 249 个国家,而联合国目前只列出了 241 个国家。
这是我目前使用的
class nation(object):
    """Read-only record describing one country/area.

    :param un_code: numeric code; anything ``int()`` accepts (e.g. ``"512"``)
    :param un_name: display name of the country/area
    :param un_3_str: three-letter code (e.g. ``"OMN"``)
    :raises ValueError: if ``un_code`` cannot be converted with ``int()``
    """

    def __init__(self, un_code, un_name, un_3_str):
        # The numeric code is normalised to int once, so callers can pass
        # either "512" or 512.
        self._code = int(un_code)
        self._name = un_name
        self._code3 = un_3_str

    def __repr__(self):
        return "nation({!r}, {!r}, {!r})".format(
            self._code, self._name, self._code3)

    @property
    def numeric(self):
        """The numeric code as an ``int``."""
        return self._code

    @property
    def name(self):
        """The country/area name exactly as passed to the constructor."""
        return self._name

    @property
    def code(self):
        """The three-letter code exactly as passed to the constructor."""
        return self._code3
class UN_db(object):
    """In-memory table of countries/areas with lookups by code.

    ``self.UN`` is a list of ``(numeric code, name, three-letter code)``
    tuples; all three items are strings. The last row,
    ``("000", "000", "UNK")``, is a sentinel meaning "unknown": the lookup
    methods return it (wrapped in a ``nation``) when nothing matches.

    NOTE(review): the rows appear to follow the UN M49 listing -- confirm
    against the current official list before relying on completeness.
    """

    def __init__(self):
        self.UN = [
            ("4", "Afghanistan", "AFG"),
            ("248", "Aland Islands", "ALA"),
            ("8", "Albania", "ALB"),
            ("12", "Algeria", "DZA"),
            ("16", "American Samoa", "ASM"),
            ("20", "Andorra", "AND"),
            ("24", "Angola", "AGO"),
            ("660", "Anguilla", "AIA"),
            ("28", "Antigua and Barbuda", "ATG"),
            ("32", "Argentina", "ARG"),
            ("51", "Armenia", "ARM"),
            ("533", "Aruba", "ABW"),
            ("36", "Australia", "AUS"),
            ("40", "Austria", "AUT"),
            ("31", "Azerbaijan", "AZE"),
            ("44", "Bahamas", "BHS"),
            ("48", "Bahrain", "BHR"),
            ("50", "Bangladesh", "BGD"),
            ("52", "Barbados", "BRB"),
            ("112", "Belarus", "BLR"),
            ("56", "Belgium", "BEL"),
            ("84", "Belize", "BLZ"),
            ("204", "Benin", "BEN"),
            ("60", "Bermuda", "BMU"),
            ("64", "Bhutan", "BTN"),
            ("68", "Bolivia (Plurinational State of)", "BOL"),
            ("535", "Bonaire, Sint Eustatius and Saba", "BES"),
            ("70", "Bosnia and Herzegovina", "BIH"),
            ("72", "Botswana", "BWA"),
            ("76", "Brazil", "BRA"),
            ("92", "British Virgin Islands", "VGB"),
            ("96", "Brunei Darussalam", "BRN"),
            ("100", "Bulgaria", "BGR"),
            ("854", "Burkina Faso", "BFA"),
            ("108", "Burundi", "BDI"),
            ("132", "Cabo Verde", "CPV"),
            ("116", "Cambodia", "KHM"),
            ("120", "Cameroon", "CMR"),
            ("124", "Canada", "CAN"),
            ("136", "Cayman Islands", "CYM"),
            ("140", "Central African Republic", "CAF"),
            ("148", "Chad", "TCD"),
            ("830", "Channel Islands", ""),
            ("152", "Chile", "CHL"),
            ("156", "China", "CHN"),
            ("344", "China, Hong Kong Special Administrative Region", "HKG"),
            ("446", "China, Macao Special Administrative Region", "MAC"),
            ("170", "Colombia", "COL"),
            ("174", "Comoros", "COM"),
            ("178", "Congo", "COG"),
            ("184", "Cook Islands", "COK"),
            ("188", "Costa Rica", "CRI"),
            ("384", "Cote d'Ivoire", "CIV"),
            ("191", "Croatia", "HRV"),
            ("192", "Cuba", "CUB"),
            ("531", "Curacao", "CUW"),
            ("196", "Cyprus", "CYP"),
            ("203", "Czech Republic", "CZE"),
            ("408", "Democratic People's Republic of Korea", "PRK"),
            ("180", "Democratic Republic of the Congo", "COD"),
            ("208", "Denmark", "DNK"),
            ("262", "Djibouti", "DJI"),
            ("212", "Dominica", "DMA"),
            ("214", "Dominican Republic", "DOM"),
            ("218", "Ecuador", "ECU"),
            ("818", "Egypt", "EGY"),
            ("222", "El Salvador", "SLV"),
            ("226", "Equatorial Guinea", "GNQ"),
            ("232", "Eritrea", "ERI"),
            ("233", "Estonia", "EST"),
            ("231", "Ethiopia", "ETH"),
            ("234", "Faeroe Islands", "FRO"),
            ("238", "Falkland Islands (Malvinas)", "FLK"),
            ("242", "Fiji", "FJI"),
            ("246", "Finland", "FIN"),
            ("250", "France", "FRA"),
            ("254", "French Guiana", "GUF"),
            ("258", "French Polynesia", "PYF"),
            ("266", "Gabon", "GAB"),
            ("270", "Gambia", "GMB"),
            ("268", "Georgia", "GEO"),
            ("276", "Germany", "DEU"),
            ("288", "Ghana", "GHA"),
            ("292", "Gibraltar", "GIB"),
            ("300", "Greece", "GRC"),
            ("304", "Greenland", "GRL"),
            ("308", "Grenada", "GRD"),
            ("312", "Guadeloupe", "GLP"),
            ("316", "Guam", "GUM"),
            ("320", "Guatemala", "GTM"),
            ("831", "Guernsey", "GGY"),
            ("324", "Guinea", "GIN"),
            ("624", "Guinea-Bissau", "GNB"),
            ("328", "Guyana", "GUY"),
            ("332", "Haiti", "HTI"),
            ("336", "Holy See", "VAT"),
            ("340", "Honduras", "HND"),
            ("348", "Hungary", "HUN"),
            ("352", "Iceland", "ISL"),
            ("356", "India", "IND"),
            ("360", "Indonesia", "IDN"),
            ("364", "Iran (Islamic Republic of)", "IRN"),
            ("368", "Iraq", "IRQ"),
            ("372", "Ireland", "IRL"),
            ("833", "Isle of Man", "IMN"),
            ("376", "Israel", "ISR"),
            ("380", "Italy", "ITA"),
            ("388", "Jamaica", "JAM"),
            ("392", "Japan", "JPN"),
            ("832", "Jersey", "JEY"),
            ("400", "Jordan", "JOR"),
            ("398", "Kazakhstan", "KAZ"),
            ("404", "Kenya", "KEN"),
            ("296", "Kiribati", "KIR"),
            ("414", "Kuwait", "KWT"),
            ("417", "Kyrgyzstan", "KGZ"),
            ("418", "Lao People's Democratic Republic", "LAO"),
            ("428", "Latvia", "LVA"),
            ("422", "Lebanon", "LBN"),
            ("426", "Lesotho", "LSO"),
            ("430", "Liberia", "LBR"),
            ("434", "Libya", "LBY"),
            ("438", "Liechtenstein", "LIE"),
            ("440", "Lithuania", "LTU"),
            ("442", "Luxembourg", "LUX"),
            ("450", "Madagascar", "MDG"),
            ("454", "Malawi", "MWI"),
            ("458", "Malaysia", "MYS"),
            ("462", "Maldives", "MDV"),
            ("466", "Mali", "MLI"),
            ("470", "Malta", "MLT"),
            ("584", "Marshall Islands", "MHL"),
            ("474", "Martinique", "MTQ"),
            ("478", "Mauritania", "MRT"),
            ("480", "Mauritius", "MUS"),
            ("175", "Mayotte", "MYT"),
            ("484", "Mexico", "MEX"),
            ("583", "Micronesia (Federated States of)", "FSM"),
            ("492", "Monaco", "MCO"),
            ("496", "Mongolia", "MNG"),
            ("499", "Montenegro", "MNE"),
            ("500", "Montserrat", "MSR"),
            ("504", "Morocco", "MAR"),
            ("508", "Mozambique", "MOZ"),
            ("104", "Myanmar", "MMR"),
            ("516", "Namibia", "NAM"),
            ("520", "Nauru", "NRU"),
            ("524", "Nepal", "NPL"),
            ("528", "Netherlands", "NLD"),
            ("540", "New Caledonia", "NCL"),
            ("554", "New Zealand", "NZL"),
            ("558", "Nicaragua", "NIC"),
            ("562", "Niger", "NER"),
            ("566", "Nigeria", "NGA"),
            ("570", "Niue", "NIU"),
            ("574", "Norfolk Island", "NFK"),
            ("580", "Northern Mariana Islands", "MNP"),
            ("578", "Norway", "NOR"),
            ("512", "Oman", "OMN"),
            ("586", "Pakistan", "PAK"),
            ("585", "Palau", "PLW"),
            ("591", "Panama", "PAN"),
            ("598", "Papua New Guinea", "PNG"),
            ("600", "Paraguay", "PRY"),
            ("604", "Peru", "PER"),
            ("608", "Philippines", "PHL"),
            ("612", "Pitcairn", "PCN"),
            ("616", "Poland", "POL"),
            ("620", "Portugal", "PRT"),
            ("630", "Puerto Rico", "PRI"),
            ("634", "Qatar", "QAT"),
            ("410", "Republic of Korea", "KOR"),
            ("498", "Republic of Moldova", "MDA"),
            ("638", "Réunion", "REU"),
            ("642", "Romania", "ROU"),
            ("643", "Russian Federation", "RUS"),
            ("646", "Rwanda", "RWA"),
            ("652", "Saint Barthélemy", "BLM"),
            ("654", "Saint Helena", "SHN"),
            ("659", "Saint Kitts and Nevis", "KNA"),
            ("662", "Saint Lucia", "LCA"),
            ("663", "Saint Martin (French part)", "MAF"),
            ("666", "Saint Pierre and Miquelon", "SPM"),
            ("670", "Saint Vincent and the Grenadines", "VCT"),
            ("882", "Samoa", "WSM"),
            ("674", "San Marino", "SMR"),
            ("678", "Sao Tome and Principe", "STP"),
            ("680", "Sark", " "),
            ("682", "Saudi Arabia", "SAU"),
            ("686", "Senegal", "SEN"),
            ("688", "Serbia", "SRB"),
            ("690", "Seychelles", "SYC"),
            ("694", "Sierra Leone", "SLE"),
            ("702", "Singapore", "SGP"),
            ("534", "Sint Maarten (Dutch part)", "SXM"),
            ("703", "Slovakia", "SVK"),
            ("705", "Slovenia", "SVN"),
            ("90", "Solomon Islands", "SLB"),
            ("706", "Somalia", "SOM"),
            ("710", "South Africa", "ZAF"),
            ("728", "South Sudan", "SSD"),
            ("724", "Spain", "ESP"),
            ("144", "Sri Lanka", "LKA"),
            ("275", "State of Palestine", "PSE"),
            ("729", "Sudan", "SDN"),
            ("740", "Suriname", "SUR"),
            ("744", "Svalbard and Jan Mayen Islands", "SJM"),
            ("748", "Swaziland", "SWZ"),
            ("752", "Sweden", "SWE"),
            ("756", "Switzerland", "CHE"),
            ("760", "Syrian Arab Republic", "SYR"),
            ("762", "Tajikistan", "TJK"),
            ("764", "Thailand", "THA"),
            ("807", "The former Yugoslav Republic of Macedonia", "MKD"),
            ("626", "Timor-Leste", "TLS"),
            ("768", "Togo", "TGO"),
            ("772", "Tokelau", "TKL"),
            ("776", "Tonga", "TON"),
            ("780", "Trinidad and Tobago", "TTO"),
            ("788", "Tunisia", "TUN"),
            ("792", "Turkey", "TUR"),
            ("795", "Turkmenistan", "TKM"),
            ("796", "Turks and Caicos Islands", "TCA"),
            ("798", "Tuvalu", "TUV"),
            ("800", "Uganda", "UGA"),
            ("804", "Ukraine", "UKR"),
            ("784", "United Arab Emirates", "ARE"),
            ("826", "United Kingdom of Great Britain and Northern Ireland", "GBR"),
            ("834", "United Republic of Tanzania", "TZA"),
            ("840", "United States of America", "USA"),
            ("850", "United States Virgin Islands", "VIR"),
            ("858", "Uruguay", "URY"),
            ("860", "Uzbekistan", "UZB"),
            ("548", "Vanuatu", "VUT"),
            ("862", "Venezuela (Bolivarian Republic of)", "VEN"),
            ("704", "Viet Nam", "VNM"),
            ("876", "Wallis and Futuna Islands", "WLF"),
            ("732", "Western Sahara", "ESH"),
            ("887", "Yemen", "YEM"),
            ("894", "Zambia", "ZMB"),
            ("716", "Zimbabwe", "ZWE"),
            # Sentinel row: must stay last, the lookups fall back to it.
            ("000", "000", "UNK"),
        ]

    @property
    def count(self):
        """Number of rows in the table, the "UNK" sentinel included."""
        return len(self.UN)

    @property
    def UN_Codes(self):
        """All numeric codes as a list of ``int`` (sentinel included as 0)."""
        return [int(row[0]) for row in self.UN]

    @property
    def Str_Codes(self):
        """All three-letter codes as a list of ``str`` (sentinel included)."""
        return [row[2] for row in self.UN]

    def _find_row(self, column, key, convert=str):
        """Return the first row whose ``convert(row[column])`` equals ``key``.

        Falls back to the sentinel row (the last entry of ``self.UN``) when
        no row matches.
        """
        for row in self.UN:
            if convert(row[column]) == key:
                return row
        return self.UN[-1]

    def getby_int(self, un_code_as_int):
        """Look up an entry by its numeric code, e.g. Oman is 512.

        :param un_code_as_int: numeric code as an ``int``
        :return: ``nation`` for the matching row, or the "UNK" sentinel
                 ``nation`` when the code is not in the table
        """
        # The table stores the numeric code as a string, so compare as int.
        return nation(*self._find_row(0, un_code_as_int, convert=int))

    def getby_code(self, un_code_as_code):
        """Look up an entry by its three-letter code, e.g. Oman is OMN.

        Matching ignores case and surrounding whitespace.

        :param un_code_as_code: three-letter code as a ``str``
        :return: ``nation`` for the matching row, or the "UNK" sentinel
                 ``nation`` when the code is not in the table
        """
        wanted = un_code_as_code.strip().upper()
        if not wanted:
            # A few rows have a blank three-letter code; a blank query must
            # not resolve to one of them.
            return nation(*self.UN[-1])
        # Column 2 holds the three-letter code (column 0 is the numeric one).
        return nation(*self._find_row(2, wanted))
答案 1 :(得分:0)
您可以使用 DataPrep 库中的函数 validate_country()。
使用 pip install dataprep 安装该库。
validate_country()
如果值为有效国家/地区则返回 True,否则返回 False。
import pandas as pd
from dataprep.clean import validate_country

# Sample column mixing valid names with two misspelled ones ("Indi", "Koreai").
df = pd.DataFrame({"column1": ["Hong Kong", "United States of America",
                               "Malaysia", "Thailand", "Canada", "Indi",
                               "Koreai", "Japan", "Australia"]})
# Boolean Series aligned with the column: True where the value is a valid
# country/region, False otherwise.
srs = validate_country(df["column1"])
srs
0 True
1 True
2 True
3 True
4 True
5 False
6 False
7 True
8 True
Name: column1, dtype: bool
因此,若只想打印无效的国家/地区,可以将 validate_country()
的输出取反,用它对该列进行筛选:
# Invert the validity mask so that only the rows holding invalid names remain.
invalid_mask = ~validate_country(df["column1"])
df["column1"][invalid_mask]
5 Indi
6 Koreai
Name: column1, dtype: object