Source code for lector.types.regex

"""Common regex patterns used in mutiple modules."""

[docs] RE_INT_SIGN = "^(?P<sign>[+-])?(?P<num>[0-9]+)$"
"""Capture optional sign and numeric parts in integer strings."""
[docs] RE_IS_INT = r"^\+?\-?[0-9]+$"
"""Strings matching int representations we're able to parse."""
[docs] RE_IS_FLOAT = "^[-]?[0-9]*[.]?[0-9]*(?:[e][+-]?[0-9]+)?$"
"""Strings matching float representations convertable by Arrow. Allows ints too, but those should have been inferred before trying floats. """
[docs] RE_LIST_LIKE: str = r"^[\(\[\|][\s\S]*[\)\]\|]$"
"""Lists start and end with parenthesis-like characters."""
[docs] RE_LIST_CLEAN: str = r"^[\[\{\(\|<]|[\]\}\)\|>]$|\r?\n"
"""Remove all parenthesis-like characters from start and end as well as line breaks."""
[docs] RE_URL = ( r"^(http://www\.|https://www\.|http://|https://)?" # http:// or https:// r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain... r"localhost|" # localhost... r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip r"(?::\d+)?" # optional port r"(?:/?|[/?]\S+)$" )
[docs] RE_TRAILING_DECIMALS: str = r"\.(\d+)$"
"""Strictly trailing, i.e. nothing after the decimals."""
[docs] RE_FRATIONAL_SECONDS: str = r"(?P<frac>\.\d+)"
"""Allows for timezone after fractional seconds, capturing part to be replaced."""
[docs] RE_TZ_OFFSET: str = r"(?P<tz>Z|[+-]\d{2}:?\d{2}$)"
"""Detect 4 digit timezone offsets, with or without colon."""