# dbtk/etl/transforms/address.py
"""
Address parsing, validation, and standardization.
Uses usaddress library for US address parsing with custom normalization.
Supports both generic international addresses and US-specific validation.
Required dependency:
pip install usaddress
"""
import re
from typing import Any, Dict, Optional
try:
import usaddress
from usaddress import RepeatedLabelError
HAS_USADDRESS = True
except ImportError:
HAS_USADDRESS = False
RepeatedLabelError = Exception
# Two-letter postal codes accepted as a valid US "state" during validation:
# the 50 states on the first five rows, then DC and the territories.
# Membership is tested against the upper-cased state value.
US_STATES = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    'DC', 'AS', 'GU', 'MP', 'PR', 'VI'  # Territories
}
# ZIP code format: exactly five digits, optionally followed by a hyphen and
# four more digits (ZIP+4). Anchored at both ends.
ZIP_PATTERN = re.compile(r'^\d{5}(-\d{4})?$')
# Normalization mappings for USPS abbreviations.
# STREET_TYPES maps an UPPER-CASE spelled-out street suffix to the title-cased
# abbreviation written into formatted addresses. Lookups go through
# _normalize_component(), which upper-cases and strips the input first, so the
# keys must stay upper-case.
STREET_TYPES = {
    'ALLEY': 'Aly', 'ANNEX': 'Anx', 'ARCADE': 'Arc', 'AVENUE': 'Ave',
    'BAYOU': 'Byu', 'BEACH': 'Bch', 'BEND': 'Bnd', 'BLUFF': 'Blf',
    'BOTTOM': 'Btm', 'BOULEVARD': 'Blvd', 'BRANCH': 'Br', 'BRIDGE': 'Brg',
    'BROOK': 'Brk', 'BURG': 'Bg', 'BYPASS': 'Byp', 'CAMP': 'Cp',
    'CANYON': 'Cyn', 'CAPE': 'Cpe', 'CAUSEWAY': 'Cswy', 'CENTER': 'Ctr',
    'CIRCLE': 'Cir', 'CLIFF': 'Clf', 'CLUB': 'Clb', 'COMMON': 'Cmn',
    'CORNER': 'Cor', 'COURSE': 'Crse', 'COURT': 'Ct', 'COVE': 'Cv',
    'CREEK': 'Crk', 'CRESCENT': 'Cres', 'CROSSING': 'Xing', 'DALE': 'Dl',
    'DAM': 'Dm', 'DIVIDE': 'Dv', 'DRIVE': 'Dr', 'ESTATE': 'Est',
    'EXPRESSWAY': 'Expy', 'EXTENSION': 'Ext', 'FALL': 'Fall', 'FERRY': 'Fry',
    'FIELD': 'Fld', 'FLAT': 'Flt', 'FORD': 'Frd', 'FOREST': 'Frst',
    'FORGE': 'Frg', 'FORK': 'Frk', 'FORT': 'Ft', 'FREEWAY': 'Fwy',
    'GARDEN': 'Gdn', 'GATEWAY': 'Gtwy', 'GLEN': 'Gln', 'GREEN': 'Grn',
    'GROVE': 'Grv', 'HARBOR': 'Hbr', 'HAVEN': 'Hvn', 'HEIGHTS': 'Hts',
    'HIGHWAY': 'Hwy', 'HILL': 'Hl', 'HOLLOW': 'Holw', 'INLET': 'Inlt',
    'ISLAND': 'Is', 'ISLE': 'Isle', 'JUNCTION': 'Jct', 'KEY': 'Ky',
    'KNOLL': 'Knl', 'LAKE': 'Lk', 'LAND': 'Land', 'LANDING': 'Lndg',
    'LANE': 'Ln', 'LIGHT': 'Lgt', 'LOAF': 'Lf', 'LOCK': 'Lck',
    'LODGE': 'Ldg', 'LOOP': 'Loop', 'MALL': 'Mall', 'MANOR': 'Mnr',
    'MEADOW': 'Mdw', 'MILL': 'Ml', 'MISSION': 'Msn', 'MOUNT': 'Mt',
    'MOUNTAIN': 'Mtn', 'NECK': 'Nck', 'ORCHARD': 'Orch', 'PARK': 'Park',
    'PARKWAY': 'Pkwy', 'PASS': 'Pass', 'PATH': 'Path', 'PIKE': 'Pike',
    'PINE': 'Pne', 'PLACE': 'Pl', 'PLAIN': 'Pln', 'PLAZA': 'Plz',
    'POINT': 'Pt', 'PORT': 'Prt', 'PRAIRIE': 'Pr', 'RADIAL': 'Radl',
    'RANCH': 'Rnch', 'RAPID': 'Rpd', 'REST': 'Rst', 'RIDGE': 'Rdg',
    'RIVER': 'Riv', 'ROAD': 'Rd', 'ROUTE': 'Rte', 'ROW': 'Row',
    'RUN': 'Run', 'SHOAL': 'Shl', 'SHORE': 'Shr', 'SPRING': 'Spg',
    'SPUR': 'Spur', 'SQUARE': 'Sq', 'STATION': 'Sta', 'STREAM': 'Strm',
    'STREET': 'St', 'SUMMIT': 'Smt', 'TERRACE': 'Ter', 'TRACE': 'Trce',
    'TRACK': 'Trak', 'TRAIL': 'Trl', 'TUNNEL': 'Tunl', 'TURNPIKE': 'Tpke',
    'UNDERPASS': 'Upas', 'UNION': 'Un', 'VALLEY': 'Vly', 'VIADUCT': 'Via',
    'VIEW': 'Vw', 'VILLAGE': 'Vlg', 'VILLE': 'Vl', 'VISTA': 'Vis',
    'WALK': 'Walk', 'WALL': 'Wall', 'WAY': 'Way', 'WELL': 'Wl'
}
# Spelled-out compass direction (UPPER-CASE key) -> abbreviation. Used for the
# street pre- and post-directional components of address line 1.
DIRECTIONALS = {
    'NORTH': 'N', 'SOUTH': 'S', 'EAST': 'E', 'WEST': 'W',
    'NORTHEAST': 'NE', 'NORTHWEST': 'NW', 'SOUTHEAST': 'SE', 'SOUTHWEST': 'SW'
}
# Spelled-out unit designator (UPPER-CASE key) -> title-cased abbreviation.
# Used for both the occupancy type and the subaddress type on address line 2.
OCCUPANCY_TYPES = {
    'APARTMENT': 'Apt', 'BASEMENT': 'Bsmt', 'BUILDING': 'Bldg',
    'DEPARTMENT': 'Dept', 'FLOOR': 'Fl', 'FRONT': 'Frnt',
    'HANGAR': 'Hngr', 'LOBBY': 'Lbby', 'LOT': 'Lot',
    'LOWER': 'Lowr', 'OFFICE': 'Ofc', 'PENTHOUSE': 'Ph',
    'PIER': 'Pier', 'REAR': 'Rear', 'ROOM': 'Rm',
    'SIDE': 'Side', 'SLIP': 'Slip', 'SPACE': 'Spc',
    'STOP': 'Stop', 'SUITE': 'Ste', 'TRAILER': 'Trlr',
    'UNIT': 'Unit', 'UPPER': 'Uppr'
}
def _check_usaddress():
    """
    Fail fast with install instructions when usaddress is unavailable.

    Raises:
        ImportError: If the optional usaddress import at module load failed.
    """
    if HAS_USADDRESS:
        return
    message = (
        "Address functionality requires the usaddress library. "
        "Install with: pip install usaddress==0.5.10 (Python 3.6) or pip install usaddress (Python 3.7+)"
    )
    raise ImportError(message)
def _normalize_component(value: str, mapping: Dict[str, str]) -> str:
"""Normalize a component using the provided mapping."""
if not value:
return value
upper_value = value.upper().strip()
return mapping.get(upper_value, value.title())
def _build_address_line(components: Dict[str, str]) -> str:
"""Build address line 1 from parsed components."""
parts = []
# Address number prefix (rare)
if components.get('AddressNumberPrefix'):
parts.append(components['AddressNumberPrefix'])
# Address number
if components.get('AddressNumber'):
parts.append(components['AddressNumber'])
# Address number suffix (rare)
if components.get('AddressNumberSuffix'):
parts.append(components['AddressNumberSuffix'])
# Street name pre-directional
if components.get('StreetNamePreDirectional'):
parts.append(_normalize_component(
components['StreetNamePreDirectional'], DIRECTIONALS))
# Street name pre-modifier (rare)
if components.get('StreetNamePreModifier'):
parts.append(components['StreetNamePreModifier'].title())
# Street name pre-type (rare, e.g., "Avenue A")
if components.get('StreetNamePreType'):
parts.append(_normalize_component(
components['StreetNamePreType'], STREET_TYPES))
# Street name
if components.get('StreetName'):
parts.append(components['StreetName'].title())
# Street name post-type
if components.get('StreetNamePostType'):
parts.append(_normalize_component(
components['StreetNamePostType'], STREET_TYPES))
# Street name post-directional
if components.get('StreetNamePostDirectional'):
parts.append(_normalize_component(
components['StreetNamePostDirectional'], DIRECTIONALS))
return ' '.join(parts)
def _build_address_line_2(components: Dict[str, str]) -> Optional[str]:
"""Build address line 2 from parsed components."""
parts = []
# Building name
if components.get('BuildingName'):
parts.append(components['BuildingName'].title())
# Occupancy type and identifier
if components.get('OccupancyType'):
occ_type = _normalize_component(
components['OccupancyType'], OCCUPANCY_TYPES)
parts.append(occ_type)
if components.get('OccupancyIdentifier'):
parts.append(components['OccupancyIdentifier'])
# Subaddress type and identifier (alternative to occupancy)
elif components.get('SubaddressType'):
sub_type = _normalize_component(
components['SubaddressType'], OCCUPANCY_TYPES)
parts.append(sub_type)
if components.get('SubaddressIdentifier'):
parts.append(components['SubaddressIdentifier'])
# USPS Box
if components.get('USPSBoxType'):
parts.append(components['USPSBoxType'])
if components.get('USPSBoxID'):
parts.append(components['USPSBoxID'])
return ' '.join(parts) if parts else None
class Address:
    """
    Generic address parser and formatter.

    Parses address strings into components and provides standardized formatting.
    Works with any address format but optimized for US addresses.

    String input is parsed with ``usaddress.tag``. Dictionary input is taken
    as already-split components (``address_line_1``, ``address_line_2``,
    ``city``, ``state``, ``postal_code``, ``country``) and stored as given,
    without normalization.

    Example
    -------
    ::

        addr = Address("123 Main St, Springfield, IL 62701")
        print(addr.street_number)  # "123"
        print(addr.street_name)    # "Main"
        print(addr.street_type)    # "St"
        print(addr.city)           # "Springfield"
        print(addr.state)          # "IL"
        print(addr.postal_code)    # "62701"
        print(addr.format())       # Standardized format

        # From components
        addr = Address({
            'address_line_1': '123 Main St',
            'city': 'Springfield',
            'state': 'IL',
            'postal_code': '62701'
        })
    """

    # First letter of each word. A letter that directly follows a digit is
    # not at a word boundary, so ordinals such as "42nd" stay lower case.
    _WORD_INITIAL = re.compile(r"\b[^\W\d_]")

    def __init__(self, value: Any):
        """
        Parse address from string or dictionary.

        Args:
            value: Address string or dictionary of components. Any other
                type is converted with ``str()`` (falsy values become '').
        Raises:
            ImportError: If the usaddress library is not installed.
        """
        _check_usaddress()
        self.raw = value if isinstance(value, str) else None
        self._components = {}
        self._parsed_components = {}
        self._parsed = False
        if isinstance(value, str):
            self._parse_string(value)
        elif isinstance(value, dict):
            # Copy so later changes to the caller's dict do not leak in.
            self._components = value.copy()
            self._parsed = True
        else:
            self.raw = str(value) if value else ''
            self._parse_string(self.raw)

    @staticmethod
    def _title_case(text: str) -> str:
        """Title-case text without capitalizing letters that follow a digit.

        ``str.title()`` would turn "42nd" into "42Nd".
        """
        return Address._WORD_INITIAL.sub(
            lambda match: match.group(0).upper(), text.lower())

    def _parse_string(self, address_string: str):
        """
        Parse address string into components.

        Blank input leaves the instance empty and unparsed. When usaddress
        reports a repeated label, the raw string is kept as
        ``address_line_1`` and the instance stays unparsed, so ``format()``
        returns the raw text.
        """
        if not address_string or not address_string.strip():
            return
        try:
            # Use usaddress to parse
            tagged, _address_type = usaddress.tag(address_string)
        except RepeatedLabelError:
            # If parsing fails with ambiguous results, store as-is
            self._components = {'address_line_1': address_string}
            self._parsed = False
            return
        parsed = dict(tagged)
        self._parsed_components = parsed
        city = parsed.get('PlaceName')
        state = parsed.get('StateName')
        # Build normalized components
        candidates = {
            'address_line_1': _build_address_line(parsed),
            'address_line_2': _build_address_line_2(parsed),
            'city': self._title_case(city) if city else None,
            'state': state.upper() if state else None,
            'postal_code': parsed.get('ZipCode'),
        }
        # Remove None/empty values
        self._components = {k: v for k, v in candidates.items() if v}
        self._parsed = True

    @property
    def address_line_1(self) -> Optional[str]:
        """First line of address (street)."""
        return self._components.get('address_line_1')

    @property
    def address_line_2(self) -> Optional[str]:
        """Second line of address (unit, apt, etc.)."""
        return self._components.get('address_line_2')

    @property
    def city(self) -> Optional[str]:
        """City name."""
        return self._components.get('city')

    @property
    def state(self) -> Optional[str]:
        """State/province/region code."""
        return self._components.get('state')

    @property
    def postal_code(self) -> Optional[str]:
        """Postal/zip code."""
        return self._components.get('postal_code')

    @property
    def country(self) -> Optional[str]:
        """Country code or name (only set when supplied via a dictionary)."""
        return self._components.get('country')

    # Component properties: these read the raw usaddress labels, so they are
    # None for instances built from a dictionary.
    @property
    def street_number(self) -> Optional[str]:
        """Street number/building number."""
        return self._parsed_components.get('AddressNumber')

    @property
    def street_name(self) -> Optional[str]:
        """Street name without number or type."""
        name = self._parsed_components.get('StreetName')
        return self._title_case(name) if name else None

    @property
    def street_type(self) -> Optional[str]:
        """Street type (St, Ave, Rd, etc.)."""
        street_type = self._parsed_components.get('StreetNamePostType')
        if street_type:
            return _normalize_component(street_type, STREET_TYPES)
        return None

    @property
    def unit_number(self) -> Optional[str]:
        """Unit/apartment/suite number."""
        return (self._parsed_components.get('OccupancyIdentifier') or
                self._parsed_components.get('SubaddressIdentifier'))

    @property
    def components(self) -> Dict[str, str]:
        """Get all address components as dictionary (a copy)."""
        return self._components.copy()

    def format(self, style: str = 'standard') -> str:
        """
        Format address in specified style.

        Args:
            style: Format style ('standard', 'single_line', 'multiline').
                Any other value is treated as 'standard'.
        Returns:
            Formatted address string. Unparsed or empty addresses return the
            raw input (or '' when there is none).

        Example
        -------
        ::

            addr.format('standard')     # "123 Main St, Springfield, IL 62701"
            addr.format('single_line')  # "123 Main St Springfield IL 62701"
            addr.format('multiline')    # "123 Main St\\nSpringfield, IL 62701"
        """
        if not self._parsed or not self._components:
            return self.raw or ''
        # Dictionary input may carry non-string values (e.g. an int ZIP), so
        # coerce before joining.
        line1, line2, city, state, postal = (
            str(part) if part else ''
            for part in (self.address_line_1, self.address_line_2,
                         self.city, self.state, self.postal_code)
        )
        if style == 'single_line':
            return ' '.join(p for p in (line1, line2, city, state, postal) if p)
        # "City, ST 12345": the comma goes only between the city and the
        # state/ZIP group, and never dangles when either side is missing.
        state_zip = ' '.join(p for p in (state, postal) if p)
        locality = ', '.join(p for p in (city, state_zip) if p)
        separator = '\n' if style == 'multiline' else ', '
        return separator.join(p for p in (line1, line2, locality) if p)

    def __str__(self) -> str:
        """Default string representation (standard format)."""
        return self.format('standard')
class USAddress(Address):
    """
    US-specific address with validation.

    Extends Address with US-specific validation rules including:
    - Valid 2-letter state code
    - Valid ZIP code format (5 digits or 5+4)
    - Required components (street, city, state, zip)

    Validation runs lazily on first access and the result is cached.

    Example
    -------
    ::

        addr = USAddress("123 Main St, Springfield, IL 62701")
        print(addr.is_valid)  # True

        addr = USAddress("123 Main St, Springfield, XX 12345")
        print(addr.is_valid)  # False (invalid state)
        print(addr.validation_errors)  # ["Invalid state code: XX"]

        # Check individual components
        addr = USAddress("Springfield, IL")
        print(addr.has_street)  # False
        print(addr.has_city)    # True
        print(addr.has_state)   # True
        print(addr.has_zip)     # False
    """

    def __init__(self, value: Any):
        """
        Parse and validate US address.

        Args:
            value: Address string or dictionary of components
        """
        super().__init__(value)
        # None means "not validated yet"; filled in by validation_errors.
        self._validation_errors = None

    @property
    def has_street(self) -> bool:
        """Check if address has street component."""
        return bool(self.address_line_1)

    @property
    def has_city(self) -> bool:
        """Check if address has city component."""
        return bool(self.city)

    @property
    def has_state(self) -> bool:
        """Check if address has state component."""
        return bool(self.state)

    @property
    def has_zip(self) -> bool:
        """Check if address has postal code component."""
        return bool(self.postal_code)

    @property
    def is_valid_state(self) -> bool:
        """Check if state code is valid US state (case-insensitive)."""
        if not self.state:
            return False
        # str() guards against non-string values from dictionary input.
        return str(self.state).upper() in US_STATES

    @property
    def is_valid_zip(self) -> bool:
        """Check if ZIP code format is valid (5 digits or ZIP+4)."""
        if not self.postal_code:
            return False
        # fullmatch, not match: with match() the pattern's trailing "$" also
        # accepts a value ending in a newline ("62701\n"). str() lets an
        # integer ZIP from dictionary input be checked instead of raising.
        return bool(ZIP_PATTERN.fullmatch(str(self.postal_code)))

    @property
    def validation_errors(self) -> list:
        """Get list of validation errors (computed once, then cached)."""
        if self._validation_errors is None:
            self._validation_errors = self._validate()
        return self._validation_errors

    def _validate(self) -> list:
        """
        Perform validation and return list of errors.

        A missing state or ZIP is reported as missing only; the format
        checks run only when the component is present.
        """
        errors = []
        if not self.has_street:
            errors.append("Missing street address")
        if not self.has_city:
            errors.append("Missing city")
        if not self.has_state:
            errors.append("Missing state")
        elif not self.is_valid_state:
            errors.append(f"Invalid state code: {self.state}")
        if not self.has_zip:
            errors.append("Missing ZIP code")
        elif not self.is_valid_zip:
            errors.append(f"Invalid ZIP code format: {self.postal_code}")
        return errors

    @property
    def is_valid(self) -> bool:
        """Check if address is valid according to US rules."""
        return len(self.validation_errors) == 0

    def validate(self) -> bool:
        """
        Validate the address.

        Returns:
            True if valid, False otherwise

        Note: Use validation_errors property to see specific issues
        """
        return self.is_valid
# Convenience functions
def parse_address(address: str) -> Dict[str, str]:
    """
    Split an address string into its normalized components.

    Args:
        address: Address string to parse
    Returns:
        Dictionary of address components (a copy owned by the caller)
    Example:
        components = parse_address("123 Main St, Springfield, IL 62701")
        # {'address_line_1': '123 Main St', 'city': 'Springfield', ...}
    """
    return Address(address).components
def standardize_address(address: str, style: str = 'standard') -> str:
    """
    Standardize address formatting.

    Args:
        address: Address string to standardize
        style: Format style ('standard', 'single_line', 'multiline')
    Returns:
        Standardized address string
    Example:
        standardize_address("123 main street, springfield il 62701")
        # "123 Main St, Springfield, IL 62701"
    """
    addr = Address(address)
    return addr.format(style)
def validate_us_address(address: str) -> bool:
    """
    Validate US address format and components.

    Args:
        address: Address string to validate
    Returns:
        True if valid US address, False otherwise
    Example:
        validate_us_address("123 Main St, Springfield, IL 62701")  # True
        validate_us_address("123 Main St, Springfield, XX 12345")  # False
    """
    addr = USAddress(address)
    return addr.is_valid