Source code for dbtk.readers.csv

# dbtk/readers/csv.py

"""CSV file reader with flexible delimiter and quoting support."""

import csv
import os
from typing import TextIO, List, Any, Iterator, Optional
from .base import Reader, logger
from csv import excel_tab

class CSVReader(Reader):
    """
    Read CSV (Comma-Separated Values) files with flexible formatting options.

    CSVReader provides a simple, consistent interface for reading CSV files
    with support for various delimiters, quoting styles, and header handling.
    It handles messy real-world CSV files by providing automatic header
    cleaning, custom dialects, and the ability to override headers entirely.

    The reader returns Record objects by default (supporting attribute, key,
    and index access) or plain dictionaries if preferred. It automatically
    handles tab-delimited files when delimiter='\\t' is specified.

    Parameters
    ----------
    fp : file-like object
        Open file pointer to CSV file (from open() or similar). For encoding
        detection, use ``get_reader('file.csv', encoding='detect')`` instead
        of opening the file directly.
    dialect : csv.Dialect, default csv.excel
        CSV dialect defining formatting rules. Common options:

        * ``csv.excel`` - Standard CSV format (comma delimiter, quoted strings)
        * ``csv.excel_tab`` - Tab-delimited format
        * ``csv.unix_dialect`` - Unix-style CSV (LF line endings)
    headers : List[str], optional
        Custom header names to use instead of reading from first row.
        Useful when CSV has no header row or you want to rename columns.
    add_row_num : bool, default True
        Add '_row_num' field to each record with 1-based row number
    skip_rows : int, default 0
        Number of data rows to skip after headers
    n_rows : int, optional
        Maximum rows to read, None for all
    null_values : str, list, tuple, or set, optional
        Values to convert to None (e.g., '\\N' for IMDB files)
    **kwargs
        Additional arguments passed to csv.reader() like delimiter,
        quotechar, etc.

    Example
    -------
    ::

        from dbtk import readers

        # Basic CSV reading
        with readers.CSVReader(open('users.csv')) as reader:
            for user in reader:
                print(f"{user.name}: {user.email}")

        # Tab-delimited file
        with readers.CSVReader(open('data.tsv'), delimiter='\\t') as reader:
            for record in reader:
                process(record)

        # Custom delimiter and quoting
        with readers.CSVReader(open('data.txt'),
                               delimiter='|',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL) as reader:
            for record in reader:
                print(record)

        # Provide custom headers (file has no header row)
        headers = ['id', 'name', 'email', 'created']
        with readers.CSVReader(open('data.csv'), headers=headers) as reader:
            for record in reader:
                print(record.id, record.name)

        # Skip first 10 data rows, read only 100 rows
        with readers.CSVReader(open('large.csv'),
                               skip_rows=10, n_rows=100) as reader:
            data = list(reader)

    See Also
    --------
    Reader : Base class with common reader features
    readers.get_reader : Automatic reader selection based on file extension

    Notes
    -----
    * Automatically converts '\\t' delimiter to excel_tab dialect (only when
      the default ``csv.excel`` dialect is in use)
    * Headers are read from first row unless custom headers provided
    * File pointer is automatically closed when used as context manager
    """

    def __init__(self,
                 fp: TextIO,
                 dialect=csv.excel,
                 headers: Optional[List[str]] = None,
                 add_row_num: bool = True,
                 skip_rows: int = 0,
                 n_rows: Optional[int] = None,
                 null_values=None,
                 **kwargs):
        """
        Initialize CSV reader for a file.

        Parameters
        ----------
        fp : file-like object
            Open file pointer to CSV file
        dialect : csv.Dialect, default csv.excel
            CSV dialect (excel, excel_tab, unix_dialect, etc.)
        headers : List[str], optional
            Custom headers to use instead of reading from file
        add_row_num : bool, default True
            Add _row_num field to records
        skip_rows : int, default 0
            Data rows to skip after headers
        n_rows : int, optional
            Maximum rows to read
        null_values : str, list, tuple, or set, optional
            Values to convert to None (e.g., '\\N' for IMDB files)
        **kwargs
            Additional csv.reader() arguments (delimiter, quotechar, etc.)
        """
        # A tab delimiter on top of the default dialect is shorthand for the
        # excel_tab dialect; drop the now-redundant keyword so csv.reader()
        # takes the delimiter from the dialect.
        if kwargs.get('delimiter') == '\t' and dialect == csv.excel:
            dialect = csv.excel_tab
            kwargs.pop('delimiter')

        super().__init__(add_row_num=add_row_num, skip_rows=skip_rows,
                         n_rows=n_rows, headers=headers,
                         null_values=null_values)
        self.fp = fp

        # TextIOWrapper reports the encoding exactly as the caller spelled it
        # ('UTF-8', 'utf8', 'utf_8', ...), so normalise before comparing.
        # Objects without a string encoding (e.g. StringIO) are skipped.
        encoding = getattr(fp, 'encoding', None)
        if isinstance(encoding, str) and encoding.lower().replace('_', '-') in ('utf-8', 'utf8'):
            # Using the standard utf-8 encoding can cause issues with BOM headers in column names
            logger.warning("utf-8 encoding detected. Consider using 'utf-8-sig' encoding instead.")

        # Set trackable for progress tracking
        if hasattr(fp, '_uncompressed_size'):
            # Compressed file - use buffer's tell() but preserve _uncompressed_size
            self._trackable = fp.buffer
            self._trackable._uncompressed_size = fp._uncompressed_size
        else:
            # Text mode file: track the underlying binary buffer.
            # Binary mode or other file type: track the object itself.
            self._trackable = getattr(fp, 'buffer', fp)
            try:
                self._trackable._uncompressed_size = os.fstat(self._trackable.fileno()).st_size
            except (AttributeError, OSError):
                # Best effort only: in-memory streams have no usable fileno()
                # (io.UnsupportedOperation is an OSError) and some objects
                # reject new attributes. The size is simply left unset.
                pass

        self._rdr = csv.reader(fp, dialect=dialect, **kwargs)
        self._headers_read = False

    def _read_headers(self) -> List[str]:
        """Read the header row from the CSV file or use provided headers.

        Returns:
            List of header values.

        Raises:
            ValueError: If the file is empty and no headers are provided.
        """
        # Headers already known (supplied by the caller or read earlier):
        # nothing is consumed from the file.
        if self._raw_headers is not None:
            self._headers_read = True
            return self._raw_headers

        if not self._headers_read:
            try:
                self._raw_headers = next(self._rdr)
            except StopIteration:
                # Record the empty result so later calls return [] instead of
                # trying to read again, then report the problem. The internal
                # StopIteration is an implementation detail, hence `from None`.
                self._raw_headers = []
                self._headers_read = True
                raise ValueError("Empty CSV file") from None
            self._headers_read = True
        return self._raw_headers

    def _generate_rows(self) -> Iterator[List[Any]]:
        """Yield the remaining rows from the underlying csv reader.

        No rows are skipped here; every row the csv reader still has to
        offer is passed through unchanged.

        Yields:
            List of values for each data row.
        """
        yield from self._rdr

    def _cleanup(self):
        """Close the file pointer."""
        if self.fp and hasattr(self.fp, 'close'):
            self.fp.close()