vkit.utility.text.opt

 1# Copyright 2022 vkit-x Administrator. All Rights Reserved.
 2#
 3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
 4#
 5# The commercial license gives you the full rights to create and distribute software
 6# on your own terms without any SSPL license obligations. For more information,
 7# please see the "LICENSE_COMMERCIAL.txt" file.
 8#
 9# This project is also available under Server Side Public License (SSPL).
10# The SSPL licensing is ideal for use cases such as open source projects with
11# SSPL distribution, student/academic purposes, hobby projects, internal research
12# projects without external distribution, or other projects where all SSPL
13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
14from typing import Sequence, Tuple
15from enum import Enum, unique
16import itertools
17import unicodedata
18
19import intervaltree
20
21from .const import (
22    cjk_compatibility_ideograph,
23    chinese,
24    english,
25    delimiter,
26    digit,
27    whitespace,
28)
29
30
def normalize_cjk_fullwidth(text: str):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC folds compatibility characters, such as fullwidth letters, digits
    and punctuation, into their ordinary counterparts.
    """
    nfkc_text = unicodedata.normalize('NFKC', text)
    return nfkc_text
33
34
def normalize_cjk_compatibility_ideograph(text: str):
    """Replace characters listed in ``CJK_COMPATIBILITY_IDEOGRAPH``.

    Each character's code point is looked up in the
    ``cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH`` mapping.
    A hit is replaced by the mapped code point; a miss is kept unchanged.
    """
    table = cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH
    replaced_chars = []
    for char in text:
        code_point = ord(char)
        # Fall back to the original code point when there is no mapping.
        replaced_chars.append(chr(table.get(code_point, code_point)))
    return ''.join(replaced_chars)
42
43
def normalize(text: str):
    """Apply both normalization passes to *text* and return the result.

    NFKC normalization runs first; the CJK compatibility ideograph
    replacement is then applied to its output.
    """
    return normalize_cjk_compatibility_ideograph(normalize_cjk_fullwidth(text))
48
49
@unique
class LexiconType(Enum):
    """Category assigned to a single character by ``get_lexicon_type``.

    Every member except ``UNKNOWN`` is registered in the module-level
    interval tree. ``UNKNOWN`` is the fallback returned when a character's
    code point matches no registered interval.
    """
    CHINESE = 'chinese'
    ENGLISH = 'english'
    DELIMITER = 'delimiter'
    DIGIT = 'digit'
    WHITESPACE = 'whitespace'
    # Fallback for code points that are not covered by any interval.
    UNKNOWN = 'unknown'
58
59
def add_intervals(
    itv_tree: intervaltree.IntervalTree,
    nested_intervals: Sequence[Sequence[Tuple[int, int]]],
    lexicon_type: LexiconType,
):
    """Insert every ``(begin, end)`` pair of *nested_intervals* into *itv_tree*.

    Args:
        itv_tree: Tree that receives the intervals; it is mutated in place.
        nested_intervals: Groups of ``(begin, end)`` pairs whose ``end`` is
            inclusive.
        lexicon_type: Value stored as the data of every inserted interval.
    """
    for group in nested_intervals:
        for first, last in group:
            # The source ranges are inclusive, so the upper bound is widened
            # by one before insertion.
            itv_tree.addi(first, last + 1, lexicon_type)
69
70
def _build_itv_tree_lexicon_type():
    """Build the interval tree that maps code-point ranges to ``LexiconType``.

    The ranges of every lexicon constant table are inserted, then the
    function asserts that no two inserted intervals overlap.

    Returns:
        The populated ``intervaltree.IntervalTree``.
    """
    tree = intervaltree.IntervalTree()

    registrations = (
        (chinese.ITV_CHINESE, LexiconType.CHINESE),
        (english.ITV_ENGLISH, LexiconType.ENGLISH),
        (digit.ITV_DIGIT, LexiconType.DIGIT),
        (delimiter.ITV_DELIMITER, LexiconType.DELIMITER),
        (whitespace.ITV_WHITESPACE, LexiconType.WHITESPACE),
    )
    for nested_intervals, lexicon_type in registrations:
        add_intervals(tree, nested_intervals, lexicon_type)

    # Sanity check: once ordered by start, each interval must end no later
    # than the next one begins, otherwise two lexicon ranges overlap.
    ordered = sorted(tree, key=lambda itv: itv.begin)  # type: ignore
    for previous, current in zip(ordered, ordered[1:]):
        assert previous.end <= current.begin  # type: ignore

    return tree
88
89
# Built once at import time; get_lexicon_type() queries this tree on each call.
_itv_tree_lexicon_type = _build_itv_tree_lexicon_type()
91
92
def get_lexicon_type(char: str):
    """Return the ``LexiconType`` of the single character *char*.

    The code point of *char* is looked up in the module-level interval
    tree. ``LexiconType.UNKNOWN`` is returned when no interval contains it.
    """
    matches = _itv_tree_lexicon_type[ord(char)]
    if matches:
        # A point query is expected to hit exactly one interval.
        assert len(matches) == 1
        return next(iter(matches)).data
    return LexiconType.UNKNOWN
def normalize_cjk_fullwidth(text: str):
def normalize_cjk_fullwidth(text: str):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC folds compatibility characters, such as fullwidth letters, digits
    and punctuation, into their ordinary counterparts.
    """
    nfkc_text = unicodedata.normalize('NFKC', text)
    return nfkc_text
def normalize_cjk_compatibility_ideograph(text: str):
def normalize_cjk_compatibility_ideograph(text: str):
    """Replace characters listed in ``CJK_COMPATIBILITY_IDEOGRAPH``.

    Each character's code point is looked up in the
    ``cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH`` mapping.
    A hit is replaced by the mapped code point; a miss is kept unchanged.
    """
    table = cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH
    replaced_chars = []
    for char in text:
        code_point = ord(char)
        # Fall back to the original code point when there is no mapping.
        replaced_chars.append(chr(table.get(code_point, code_point)))
    return ''.join(replaced_chars)
def normalize(text: str):
def normalize(text: str):
    """Apply both normalization passes to *text* and return the result.

    NFKC normalization runs first; the CJK compatibility ideograph
    replacement is then applied to its output.
    """
    return normalize_cjk_compatibility_ideograph(normalize_cjk_fullwidth(text))
class LexiconType(enum.Enum):
class LexiconType(Enum):
    """Category assigned to a single character by ``get_lexicon_type``.

    Every member except ``UNKNOWN`` is registered in the module-level
    interval tree. ``UNKNOWN`` is the fallback returned when a character's
    code point matches no registered interval.
    """
    CHINESE = 'chinese'
    ENGLISH = 'english'
    DELIMITER = 'delimiter'
    DIGIT = 'digit'
    WHITESPACE = 'whitespace'
    # Fallback for code points that are not covered by any interval.
    UNKNOWN = 'unknown'

An enumeration.

CHINESE = <LexiconType.CHINESE: 'chinese'>
ENGLISH = <LexiconType.ENGLISH: 'english'>
DELIMITER = <LexiconType.DELIMITER: 'delimiter'>
DIGIT = <LexiconType.DIGIT: 'digit'>
WHITESPACE = <LexiconType.WHITESPACE: 'whitespace'>
UNKNOWN = <LexiconType.UNKNOWN: 'unknown'>
Inherited Members
enum.Enum
name
value
def add_intervals( itv_tree: intervaltree.intervaltree.IntervalTree, nested_intervals: Sequence[Sequence[Tuple[int, int]]], lexicon_type: vkit.utility.text.opt.LexiconType):
def add_intervals(
    itv_tree: intervaltree.IntervalTree,
    nested_intervals: Sequence[Sequence[Tuple[int, int]]],
    lexicon_type: LexiconType,
):
    """Insert every ``(begin, end)`` pair of *nested_intervals* into *itv_tree*.

    Args:
        itv_tree: Tree that receives the intervals; it is mutated in place.
        nested_intervals: Groups of ``(begin, end)`` pairs whose ``end`` is
            inclusive.
        lexicon_type: Value stored as the data of every inserted interval.
    """
    for group in nested_intervals:
        for first, last in group:
            # The source ranges are inclusive, so the upper bound is widened
            # by one before insertion.
            itv_tree.addi(first, last + 1, lexicon_type)
def get_lexicon_type(char: str):
 94def get_lexicon_type(char: str):
 95    lexicon_types = _itv_tree_lexicon_type[ord(char)]
 96    if not lexicon_types:
 97        return LexiconType.UNKNOWN
 98    else:
 99        assert len(lexicon_types) == 1
100        return next(iter(lexicon_types)).data