import pandas as pd
import re
import urllib2
# Download the 2013 PUMS data dictionary. `re.findall` needs the text itself,
# not the file-like response object, so read the whole body into a string and
# release the connection before parsing.
response = urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt')
try:
    data = response.read()
finally:
    response.close()
## replace newline characters so we can use dots and find everything until a double
## carriage return (replaced to ||) with a lookahead assertion.
## NOTE(review): the pattern below expects a literal `|` between the width and
## the description, but no newline-to-`|` replacement is visible in this
## snippet -- confirm whether that step still has to be applied to `data`.
datadict = pd.DataFrame(
    re.findall(
        r"([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})",
        data,
        re.MULTILINE),
    columns=['variable', 'width', 'description'])
| | variable | width | description |
| 0 | RT | 1 | Record Type |
| 1 | SERIALNO | 7 | Housing unit |
| 2 | DIVISION | 1 | Division code |
| 3 | PUMA | 5 | Public use microdata area code (PUMA) based on |
| 4 | REGION | 1 | Region code |
| 5 | ST | 2 | State Code |
re.findall("([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}\|\s{4}([A-Za-z\-\(\)\;\<\> 0-9]{2,85})\|\s{11,15}([a-z0-9]{0,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})",
| id | variable | width | description | value_1 | label_1 |
| 0 | DIVISION | 1 | Division code | 0 | Puerto Rico |
| 1 | REGION | 1 | Region code | 1 | Northeast |
| 2 | ST | 2 | State Code | 1 | Alabama/AL |
| 3 | NP | 2 | Number of person records following this housin... | 0 | Vacant unit |
| 4 | TYPE | 1 | Type of unit | 1 | Housing unit |
需要从开始到结尾的多行匹配,即:一些变量有很多不同的取值(例如 ST,即 state code)。
那么,如何让多行模式重复任意次数?
答案 0 :(得分:1)
这不是正则表达式的解法,但我用下面的 Python 3.x 脚本解析了 PUMSDataDict2013.txt
(来源:Census ACS 2013 documentation、FTP server)。随后我使用 pandas.DataFrame.from_dict
和 pandas.concat 创建了分层数据框(见下文)。
用于解析 PUMSDataDict2013.txt 的 Python 3.x 函数:
import collections
import os
def _parse_variable_header(line):
    """Split a variable header line into its name and a fresh entry.

    Args:
        line (str): Header line of the form ``NAME LENGTH [inline note]``.

    Returns:
        tuple: ``(var_name, entry)`` where `entry` is an ordered mapping with
            the key 'length' and, when the line carries trailing text, the key
            'notes' holding that text as a one-element list.

    Raises:
        ValueError: Raised if `line` has fewer than two fields.

    """
    # Example: "NWAB       1 (UNEDITED - See 'Employment Status Recode' (ESR))"
    # Everything after the second field is kept together as one inline note.
    (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
    entry = collections.OrderedDict()
    entry['length'] = var_len
    if var_note:
        entry['notes'] = [var_note[0]]
    return (var_name, entry)


def parse_pumsdatadict(path: str) -> collections.OrderedDict:
    r"""Parse ACS PUMS Data Dictionaries.

    Args:
        path (str): Path to downloaded data dictionary.

    Returns:
        ddict (collections.OrderedDict): Parsed data dictionary with original
            key order preserved.

    Raises:
        FileNotFoundError: Raised if `path` does not exist.
        AssertionError: Raised if a variable code line matches no known
            format, or if an unclassified line is met before the last
            variable ('PWGTP80') has been processed.

    Notes:
        * Only some data dictionaries have been tested.[^urls]
        * Values are all strings. No data types are inferred from the
            original file.
        * Example structure of returned `ddict`:
            ddict['title'] = '2013 ACS PUMS DATA DICTIONARY'
            ddict['date'] = 'August 7, 2015'
            ddict['record_types']['HOUSING RECORD']['RT']\
                ['length'] = '1'
                ['description'] = 'Record Type'
                ['var_codes']['H'] = 'Housing Record or Group Quarters Unit'
            ddict['record_types']['HOUSING RECORD'][...]
            ddict['record_types']['PERSON RECORD'][...]
            ddict['notes'] =
                ['Note for both Industry and Occupation lists...',
                 '* In cases where the SOC occupation code ends...',
                 ...]

    References:
        [^urls]: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/

    """
    # Check arguments.
    if not os.path.exists(path):
        raise FileNotFoundError(
            "Path does not exist:\n{path}".format(path=path))
    # Parse data dictionary.
    # Note:
    # * Data dictionary keys and values are "codes for variables",
    #   using the ACS terminology,
    #   https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
    # * The data dictionary is not all encoded in UTF-8. Replace encoding
    #   errors when found.
    # * Catch instances of inconsistently formatted data.
    ddict = collections.OrderedDict()
    with open(path, encoding='utf-8', errors='replace') as fobj:
        # Data dictionary name is line 1.
        ddict['title'] = fobj.readline().strip()
        # Data dictionary date is line 2.
        ddict['date'] = fobj.readline().strip()
        # Initialize flags to catch lines.
        (catch_var_name, catch_var_desc,
         catch_var_code, catch_var_note) = (None, )*4
        var_name = None
        var_name_last = 'PWGTP80'  # Necessary for unformatted end-of-file notes.
        for line in fobj:
            # Replace tabs with 4 spaces
            line = line.replace('\t', ' '*4).rstrip()
            # Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
            if line.strip() in ('HOUSING RECORD', 'PERSON RECORD'):
                record_type = line.strip()
                if 'record_types' not in ddict:
                    ddict['record_types'] = collections.OrderedDict()
                ddict['record_types'][record_type] = collections.OrderedDict()
            # A newline precedes a variable name.
            # A newline follows the last variable code.
            elif line == '':
                # Example inconsistent format case:
                # WGTP54     5
                #     Housing Weight replicate 54
                #
                #           -9999..09999 .Integer weight of housing unit
                # While codes are expected but none has been read yet, the
                # blank line is ignored so the codes that follow still attach
                # to the current variable.
                still_awaiting_codes = (
                    catch_var_code
                    and 'var_codes' not in ddict['record_types'][record_type][var_name])
                if not still_awaiting_codes:
                    # Terminate the previous variable block and look for the
                    # next variable name, unless past last variable name.
                    catch_var_code = False
                    catch_var_note = False
                    if var_name != var_name_last:
                        catch_var_name = True
            # Variable name is 1 line with 0 space indent.
            # Variable name is followed by variable description.
            # Variable note is optional.
            # Variable note is preceded by newline.
            # Variable note is 1+ lines.
            # Variable note is followed by newline.
            elif (catch_var_name and not line.startswith(' ')
                  and var_name != var_name_last):
                # Example: "Note: Public use microdata areas (PUMAs) ..."
                if line.lower().startswith('note:'):
                    notes = ddict['record_types'][record_type][var_name].setdefault(
                        'notes', [])
                    # Append a new note.
                    notes.append(line.strip())
                    catch_var_note = True
                # Example: """
                # Note: Public Use Microdata Areas (PUMAs) designate areas ...
                # population.  Use with ST for unique code. PUMA00 applies ...
                # ...
                # """
                elif catch_var_note:
                    notes = ddict['record_types'][record_type][var_name].setdefault(
                        'notes', [])
                    # Concatenate to most recent note.
                    notes[-1] += ' '+line.strip()
                # Example: "NWAB       1 (UNEDITED - See 'Employment Status Recode' (ESR))"
                else:
                    (var_name, var_entry) = _parse_variable_header(line)
                    ddict['record_types'][record_type][var_name] = var_entry
                    catch_var_name = False
                    catch_var_desc = True
                    var_desc_indent = None
            # Variable description is 1+ lines with 1+ space indent.
            # Variable description is followed by variable code(s).
            # Variable code(s) is 1+ line with larger whitespace indent
            # than variable description. Example:"""
            # PUMA00     5
            #     Public use microdata area code (PUMA) based on Census 2000 definition for data
            #     collected prior to 2012. Use in combination with PUMA10.
            #           00100..08200 .Public use microdata area codes
            #                  77777 .Combination of 01801, 01802, and 01905 in Louisiana
            #                  -0009 .Code classification is Not Applicable because data
            #                        .collected in 2012 or later
            # """
            # The last variable code is followed by a newline.
            elif (catch_var_desc or catch_var_code) and line.startswith(' '):
                indent = len(line) - len(line.lstrip())
                var_entry = ddict['record_types'][record_type][var_name]
                # For line 1 of variable description.
                if catch_var_desc and var_desc_indent is None:
                    var_desc_indent = indent
                    var_entry['description'] = line.strip()
                # For lines 2+ of variable description.
                elif catch_var_desc and indent <= var_desc_indent:
                    var_entry['description'] += ' '+line.strip()
                # For lines 1+ of variable codes.
                else:
                    catch_var_desc = False
                    catch_var_code = True
                    if not line.strip().startswith('.'):
                        # Example case: "01 .One person record (one person in household or"
                        if ' .' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep=' .', maxsplit=1)
                        # Example inconsistent format case:"""
                        #            bbbb. N/A (age less than 15 years; never married)
                        # """
                        elif '. ' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep='. ', maxsplit=1)
                        else:
                            raise AssertionError(
                                "Program error. Line unaccounted for:\n" +
                                "{line}".format(line=line))
                        if 'var_codes' not in var_entry:
                            var_entry['var_codes'] = collections.OrderedDict()
                        var_entry['var_codes'][var_code] = var_code_desc
                    # Example case: ".any person in group quarters)"
                    else:
                        # A leading '.' continues the description of the most
                        # recently read code.
                        var_code_desc = line.strip().lstrip('.')
                        var_entry['var_codes'][var_code] += ' '+var_code_desc
            # Example inconsistent format case:"""
            # ADJHSG     7
            # Adjustment factor for housing dollar amounts (6 implied decimal places)
            # """
            elif (catch_var_desc and
                  'description' not in ddict['record_types'][record_type][var_name]):
                var_desc = line.strip()
                ddict['record_types'][record_type][var_name]['description'] = var_desc
                catch_var_desc = False
                catch_var_code = True
            # Example inconsistent format case:"""
            # WGTP10     5
            #     Housing Weight replicate 10
            #           -9999..09999 .Integer weight of housing unit
            # WGTP11     5
            #     Housing Weight replicate 11
            #           -9999..09999 .Integer weight of housing unit
            # """
            # No blank line separates these variables, so the next header is
            # recognized by name instead.
            elif ((var_name == 'WGTP10' and 'WGTP11' in line)
                  or (var_name == 'YOEP12' and 'ANC' in line)):
                (var_name, var_entry) = _parse_variable_header(line)
                ddict['record_types'][record_type][var_name] = var_entry
                catch_var_name = False
                catch_var_desc = True
                var_desc_indent = None
            # Anything else is an unformatted end-of-file note.
            else:
                if (catch_var_name, catch_var_desc,
                        catch_var_code, catch_var_note) != (False, )*4:
                    raise AssertionError(
                        "Program error. All flags to catch lines should be set " +
                        "to `False` by end-of-file.")
                if var_name != var_name_last:
                    raise AssertionError(
                        "Program error. End-of-file notes should only be read "+
                        "after `var_name_last` has been processed.")
                ddict.setdefault('notes', []).append(line)
    return ddict
创建分层数据框(下面格式为Jupyter Notebook单元格):
In [ ]:
import pandas as pd
# Parse the downloaded data dictionary into nested ordered mappings.
ddict = parse_pumsdatadict(path=r'/path/to/PUMSDataDict2013.txt')
# Build one frame per record type, with one row per variable name.
tmp = {
    record_type: pd.DataFrame.from_dict(variables, orient='index')
    for (record_type, variables) in ddict['record_types'].items()
}
# Stack the per-record-type frames under a two-level (record_type, var_name) index.
df_ddict = pd.concat(tmp, names=['record_type', 'var_name'])
Out[ ]:
# Click "Run code snippet" below to render the output from `df_ddict.head()`.
<table border="1" class="dataframe">
<tr style="text-align: right;">
<th rowspan="5" valign="top">HOUSING RECORD</th>
<td>Access to the Internet</td>
<td>{'b': 'N/A (GQ)', '1': 'Yes, with subscription...</td>
<td>Lot size</td>
<td>{'b': 'N/A (GQ/not a one-family house or mobil...</td>
<td>Adjustment factor for housing dollar amounts (...</td>
<td>{'1000000': '2013 factor (1.000000)'}</td>
<td>[Note: The value of ADJHSG inflation-adjusts r...</td>
<td>Adjustment factor for income and earnings doll...</td>
<td>{'1007549': '2013 factor (1.007549)'}</td>
<td>[Note: The value of ADJINC inflation-adjusts r...</td>
<td>Sales of Agriculture Products (Yearly sales)</td>
<td>{'b': 'N/A (GQ/vacant/not a one family house o...</td>
<td>[Note: no adjustment factor is applied to AGS.]</td>