# Source code for lector.csv.dialects

"""Detectors of CSV dialects (separator, quoting etc.).

Note that python.csv is not even internally consistent. E.g. although the dialect used to produce a
CSV may specify ``\\n`` as the line terminator, the python sniffer is hard-coded to return
``\\r\\n`` (it doesn't actually support detecting it). Its own reader (and others hopefully) deals
internally with different line breaks, but it means one cannot compare a dialect used to generate a
CSV and a dialect created by sniffing the same (quoting is equally hard-coded to ``QUOTE_MINIMAL``).

Python quoting levels:

- ``QUOTE_ALL``: 1
- ``QUOTE_MINIMAL``: 0
- ``QUOTE_NONE``: 3
- ``QUOTE_NONNUMERIC``: 2

"""
from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Iterable
from contextlib import suppress
from csv import QUOTE_MINIMAL, QUOTE_NONE, Sniffer, get_dialect
from csv import Dialect as PyDialect
from dataclasses import dataclass
from itertools import islice
from typing import TextIO

from ..log import LOG

# CleverCSV is an optional dependency: remember whether it could be imported, so code that
# needs it can be guarded by this flag.
try:
    import clevercsv as ccsv
except Exception:  # kept broad: any failure while importing marks CleverCSV as unavailable
    CLEVER_CSV = False
else:
    CLEVER_CSV = True
# Type of the built-in dialect *classes* (i.e. the metaclass of ``csv.Dialect``), for use in
# annotations where a dialect class rather than an instance is expected.
PyDialectT = type(PyDialect)

# NOTE: the name is misspelled ("DFAULT"); it is kept unchanged because it is a public name.
N_ROWS_DFAULT: int = 100
"""How many rows to use for dialect detection."""
DELIMITER_OPTIONS: tuple[str, ...] = (",", ";", "\t", "|")
"""Allowed delimiters for dialect detection."""


@dataclass
class Dialect:
    """A more convenient class for dialects than Python's built-in.

    The built-in Dialect is a class with class attributes only, and so instead of instances of
    that class, Python wants you to send references to subclasses around, which is, uhm, awkward
    to say the least (see below to_builtin() for an example).
    """

    delimiter: str = ","
    quote_char: str | None = '"'  # None is only kept when quoting is QUOTE_NONE
    escape_char: str | None = None
    double_quote: bool = True
    skip_initial_space: bool = False
    line_terminator: str = "\r\n"  # Default in Python and correct according to official spec
    quoting: int = QUOTE_MINIMAL

    @classmethod
    def from_builtin(cls, dialect: str | PyDialectT) -> Dialect:
        """Make instance from built-in dialect class configured for reliable reading(!).

        Args:
            dialect: Name of a dialect registered with the csv module (e.g. ``"excel"``), or a
                built-in dialect (anything exposing the built-in dialect attributes).

        Returns:
            A new instance in which missing or inconsistent settings of the given dialect have
            been replaced with values that allow reading a CSV.
        """
        if isinstance(dialect, str):
            dialect = get_dialect(dialect)

        # A dialect without delimiter doesn't make sense, though CleverCSV may return one,
        # e.g. when a CSV file contains a single column only
        delimiter = dialect.delimiter or ","

        # To read reliably we need one of escape_char or double quote defined
        double_quote = dialect.doublequote or (dialect.escapechar is None)

        # Although most parsers ignore this, Python's csv module complains when it's missing
        line_terminator = dialect.lineterminator or "\r\n"

        # Minimal quoting won't hurt and is sensible if we already know how quoting is used
        quoting = dialect.quoting
        if quoting == QUOTE_NONE and (dialect.quotechar is not None or dialect.doublequote):
            quoting = QUOTE_MINIMAL

        # Python's csv module rejects dialects that enable quoting without a quote character,
        # so fall back to the standard one whenever quoting ends up being enabled
        quote_char = dialect.quotechar
        if quote_char is None and quoting != QUOTE_NONE:
            quote_char = '"'

        return Dialect(
            delimiter=delimiter,
            quote_char=quote_char,
            escape_char=dialect.escapechar,
            double_quote=double_quote,
            skip_initial_space=dialect.skipinitialspace,
            line_terminator=line_terminator,
            quoting=quoting,
        )

    def to_builtin(self) -> PyDialectT:
        """Make a subclass of built-in Dialect from this instance.

        Returns:
            A new class (not an instance) deriving from the built-in Dialect, with class
            attributes set from this instance's fields.
        """

        class _Dialect(PyDialect):
            _name = "generated"
            lineterminator = self.line_terminator
            quoting = self.quoting
            escapechar = self.escape_char or None  # turns an empty string into None
            doublequote = self.double_quote
            delimiter = self.delimiter
            quotechar = self.quote_char
            skipinitialspace = self.skip_initial_space
            strict = False

        return _Dialect
@dataclass
class DialectDetector(ABC):
    """Base class for all dialect detectors.

    Cannot be instantiated directly; concrete detectors must implement :meth:`detect`.
    """

    @abstractmethod
    def detect(self, buffer: TextIO) -> Dialect:
        """Detect the dialect of the CSV data that can be read from ``buffer``.

        Args:
            buffer: Text stream to read the CSV data from.

        Returns:
            The detected dialect.
        """
@dataclass
class PySniffer(DialectDetector):
    """Use Python's built-in csv sniffer."""

    delimiters: Iterable[str] = DELIMITER_OPTIONS  # candidate delimiters handed to the sniffer
    n_rows: int = N_ROWS_DFAULT  # number of leading rows to sample for detection
    log: bool = False  # whether to log when falling back to the default dialect

    def detect(self, buffer: TextIO) -> Dialect:
        """Detect a dialect we can read(!) a CSV with using the python sniffer.

        Note that the sniffer is not reliable for detecting quoting, quotechar etc., but reasonable
        defaults are almost guaranteed to work with most parsers. E.g. the lineterminator is not
        even configurable in pyarrow's csv reader, nor in pandas (python engine).

        Args:
            buffer: Seekable text stream; sampling starts at its current position.

        Returns:
            The sniffed dialect, adjusted for reliable reading, or the default dialect if
            sniffing failed for both sample sizes.
        """
        pos = buffer.tell()
        sniffer = Sniffer()
        # Don't let the sniffer fall back on its built-in list of preferred delimiters
        sniffer.preferred = []

        # Try the configured sample size first, then retry with the first row only
        for n_rows in (self.n_rows, 1):
            # Best effort: any error while sampling or sniffing just moves on to the next try
            with suppress(Exception):
                buffer.seek(pos)
                # Lines read from the buffer keep their own line breaks, so concatenate them
                # as they are instead of inserting additional (empty) lines between them
                sample = "".join(islice(buffer, n_rows))
                dialect = sniffer.sniff(sample, delimiters=self.delimiters)

                # To read(!) a CSV reliably, we must have either doublequote=True or an
                # escapechar, yet Python's sniffer may return doublequote=False and no
                # escapechar if nothing was escaped in any way in the given CSV.
                dialect.doublequote = dialect.escapechar is None

                # The lineterminator is always returned as "\r\n", but that's ok since parsers
                # tend to ignore it anyways, so it is left untouched here.

                # May detect that sample has no quotes, but if correct, parsing with minimal
                # quote option will still work, and if detection was erroneous, assuming
                # minimal quoting is more robust. It's also the default in pandas (=0) and
                # arrow ignores it.
                if dialect.quoting == QUOTE_NONE:
                    dialect.quoting = QUOTE_MINIMAL

                return Dialect.from_builtin(dialect)

        if self.log:
            LOG.info("Falling back to default dialect...")

        return Dialect()
if CLEVER_CSV:
    # CleverCSV may return non-sensical characters as escapechar.
    # Monkey-patch to at least limit to ASCII chars.
    is_potential_escapechar_orig = ccsv.escape.is_potential_escapechar

    def is_potential_escapechar(char, encoding, block_char=None):
        """Reject non-ASCII escape char candidates, else defer to CleverCSV's own check."""
        if not char.isascii():
            return False

        return is_potential_escapechar_orig(char, encoding, block_char)

    # Replace the function in every CleverCSV module that holds its own reference to it
    ccsv.escape.is_potential_escapechar = is_potential_escapechar
    ccsv.potential_dialects.is_potential_escapechar = is_potential_escapechar
    ccsv.normal_form.is_potential_escapechar = is_potential_escapechar

    @dataclass
    class CleverCSV(DialectDetector):
        """A more advanced dialect detector using CleverCsv."""

        num_chars: int = int(1e6)  # maximum number of characters read from the buffer
        skip: bool = True  # passed through to CleverCSV's detector
        method: str = "auto"  # passed through to CleverCSV's detector
        verbose: bool = False  # passed through to CleverCSV's detector

        def detect(self, buffer: TextIO) -> Dialect:
            """Detect the dialect from the first ``num_chars`` characters of ``buffer``.

            Args:
                buffer: Text stream to read the sample from, starting at its current position.

            Returns:
                The detected dialect adjusted for reliable reading, or the default dialect if
                CleverCSV could not detect any.
            """
            text = buffer.read(self.num_chars)
            dialect = ccsv.Detector().detect(
                text,
                delimiters=DELIMITER_OPTIONS,
                verbose=self.verbose,
                method=self.method,
                skip=self.skip,
            )

            # CleverCSV signals a failed detection by returning None rather than raising
            if dialect is None:
                return Dialect()

            return Dialect.from_builtin(dialect.to_csv_dialect())