vkit.utility.text.const.delimiter

Consts for detecting delimiter chars.

 1# Copyright 2022 vkit-x Administrator. All Rights Reserved.
 2#
 3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
 4#
 5# The commercial license gives you the full rights to create and distribute software
 6# on your own terms without any SSPL license obligations. For more information,
 7# please see the "LICENSE_COMMERCIAL.txt" file.
 8#
 9# This project is also available under Server Side Public License (SSPL).
10# The SSPL licensing is ideal for use cases such as open source projects with
11# SSPL distribution, student/academic purposes, hobby projects, internal research
12# projects without external distribution, or other projects where all SSPL
13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
14'''
15Consts for detecting delimiter chars.
16'''
17from typing import Sequence, Tuple
18
#: Code-point intervals whose characters are treated as delimiters.
#:
#: The outer sequence holds one group per character block; every group is a
#: list of ``(first, last)`` code-point pairs. The bounds are presumably
#: inclusive, since the single-point entry ``(0x00B7, 0x00B7)`` would
#: otherwise describe an empty interval -- confirm against the consumer.
ITV_DELIMITER: Sequence[Sequence[Tuple[int, int]]] = [
    # Group 0: ASCII punctuation/symbols plus four Latin-1 currency signs.
    [
        (0x0021, 0x002F),  # '!' .. '/'
        (0x003A, 0x0040),  # ':' .. '@'
        (0x005B, 0x0060),  # '[' .. '`'
        (0x007B, 0x007E),  # '{' .. '~'
        (0x00A2, 0x00A5),  # '¢', '£', '¤', '¥'
    ],
    # Group 1: a single code point, U+00B7 (middle dot), which the original
    # note describes as picked from the whitespace category.
    [
        (0x00B7, 0x00B7),
    ],
    # Group 2: general punctuation.
    # Chart: http://www.unicode.org/charts/PDF/U2000.pdf
    # Narrowed from the whole block (0x2000, 0x206F): the intervals below leave
    # out 0x2000-0x200F, 0x2028-0x202C, 0x202F and 0x205F-0x206F.
    [
        (0x2010, 0x2027),
        (0x202D, 0x202E),
        (0x2030, 0x205E),
    ],
    # Group 3: CJK symbols/punctuation and CJK compatibility forms.
    # Charts: http://www.unicode.org/charts/PDF/U3000.pdf
    #         http://www.unicode.org/charts/PDF/UFE30.pdf
    # Narrowed from the whole block (0x3000, 0x303F): 0x3000 and 0x3007 are
    # left out.
    [
        (0x3001, 0x3006),
        (0x3008, 0x303F),
        (0xFE30, 0xFE4F),
    ],
    # Group 4: halfwidth and fullwidth forms.
    # Chart: http://www.unicode.org/charts/PDF/UFF00.pdf
    [
        (0xFF01, 0xFF0F),  # fullwidth '！' .. '／'
        (0xFF1A, 0xFF20),  # fullwidth '：' .. '＠'
        (0xFF3B, 0xFF40),  # fullwidth '［' .. '｀'
        (0xFF5B, 0xFF64),  # '｛' .. '､'
        (0xFFE0, 0xFFEE),  # '￠' .. '￮'
    ],
]
64
# Individual characters listed as delimiter exceptions. All four code points
# lie inside the CJK intervals of ITV_DELIMITER ((0x3001, 0x3006) and
# (0x3008, 0x303F)); presumably the consumer of this set uses it to stop them
# from being classified as delimiters -- confirm against the caller.
DELIMITER_BLACKLIST = {
    '々',  # U+3005 IDEOGRAPHIC ITERATION MARK
    '〆',  # U+3006 IDEOGRAPHIC CLOSING MARK
    '〒',  # U+3012 POSTAL MARK
    '〓',  # U+3013 GETA MARK
}