Source code for lector.csv.encodings
"""Helpers to detecting character encodings in binary buffers."""
from __future__ import annotations
import codecs
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import BinaryIO, Literal
import cchardet as cdet
[docs]
BOMS: dict[str, tuple[bytes, ...]] = {
    "utf-8-sig": (codecs.BOM_UTF8,),
    # UTF-32 is listed before UTF-16 on purpose: the UTF-32-LE BOM
    # (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE), so a first-match
    # scan in insertion order would otherwise report UTF-32-LE data as "utf-16".
    "utf-32": (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE),
    "utf-16": (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE),
}
"""Map each encoding name to the BOMs (byte-order marks) that identify it."""
[docs]
MAX_INT32: int = 2_147_483_647
"""Cannot read more than this number of bytes at once to detect encoding."""

# U+FFFD REPLACEMENT CHARACTER: this is what ``bytes.decode(..., errors="replace")``
# substitutes for every byte sequence it cannot decode, so counting it in the
# decoded text counts the decoding failures.
CODEC_ERR_CHAR: str = "\ufffd"
"""Character representing non-codable bytes."""
[docs]
def detect_bom(bs: bytes) -> str | None:
    """Detect encoding by looking for a BOM at the start of the file.

    Args:
        bs: The leading bytes of the file or buffer.

    Returns:
        The encoding name whose BOM prefixes ``bs``, or ``None`` when ``bs``
        does not start with any known BOM.
    """
    best_encoding = None
    best_length = 0
    for enc, boms in BOMS.items():
        for bom in boms:
            # Keep the longest matching BOM rather than the first one found:
            # the UTF-32-LE BOM (FF FE 00 00) starts with the UTF-16-LE BOM
            # (FF FE), so a first-match scan could report UTF-32-LE as UTF-16.
            if len(bom) > best_length and bs.startswith(bom):
                best_encoding, best_length = enc, len(bom)
    return best_encoding
[docs]
def decoding_errors(bs: bytes, encoding: str, prop: bool = True) -> float:
    """Measure how much of ``bs`` could not be decoded with ``encoding``.

    Args:
        bs: Raw bytes to decode.
        encoding: Name of the codec to try.
        prop: If true, return the errors as a proportion of the decoded
            characters; otherwise return the absolute error count.

    Returns:
        The proportion (or count, as a float) of replacement characters in
        the decoded text. Empty input has no errors and yields ``0.0``.
    """
    string = bs.decode(encoding, errors="replace")
    if not string:
        # Nothing was decoded, so nothing failed; this also avoids dividing
        # by a zero length when a proportion is requested.
        return 0.0
    n_errors = string.count(CODEC_ERR_CHAR)
    return n_errors / len(string) if prop else float(n_errors)
@dataclass
class EncodingDetector(ABC):
    """Abstract interface shared by all encoding detectors."""

    @abstractmethod
    def detect(self, buffer: BinaryIO) -> str:
        """Return the name of the encoding detected for ``buffer``.

        Concrete detectors must override this method.
        """
@dataclass
class Chardet(EncodingDetector):
    """Encoding detector that prefers utf-8 and consults cchardet when utf-8 decodes poorly."""

    n_bytes: int = int(1e7)  # 10 MB
    """Number of bytes read from the buffer to detect the encoding."""

    error_threshold: float = 0.001
    """Largest proportion of utf-8 decoding errors still accepted as utf-8."""

    confidence_threshold: float = 0.6
    """Confidence a cchardet guess must exceed to be returned as detected."""

    def detect(self, buffer: BinaryIO) -> str:
        """Detect the encoding of ``buffer`` with an 'opinionated' strategy.

        The steps are tried in order: a BOM at the start of the data, then
        utf-8 if it decodes with few enough errors, then cchardet's guess if
        it is confident enough, and finally windows-1250 as the fallback.
        """
        sample: bytes = buffer.read(min(self.n_bytes, MAX_INT32))

        from_bom = detect_bom(sample)
        if from_bom:
            return from_bom

        utf8_error_rate = decoding_errors(sample, "utf-8", prop=True)
        if utf8_error_rate <= self.error_threshold:
            return "utf-8"

        guess = cdet.detect(sample)
        encoding = guess["encoding"]
        confidence = guess["confidence"]
        if encoding and confidence > self.confidence_threshold:
            return encoding

        # A low-confidence "windows"/"iso-8859" guess and every other outcome
        # end in the same place: windows-1250 is used as the superset for
        # special characters.
        return "windows-1250"