vkit.utility.text.opt
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence, Tuple
from enum import Enum, unique
import itertools
import unicodedata

import intervaltree

from .const import (
    cjk_compatibility_ideograph,
    chinese,
    english,
    delimiter,
    digit,
    whitespace,
)


def normalize_cjk_fullwidth(text: str):
    """Return *text* after Unicode NFKC normalization.

    NFKC replaces compatibility characters (which include the full-width
    forms) with their canonical equivalents.
    """
    return unicodedata.normalize('NFKC', text)


def normalize_cjk_compatibility_ideograph(text: str):
    """Replace code points listed in the CJK compatibility ideograph table.

    Each character of *text* is looked up by code point in
    ``cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH``; code points
    that are not in the table are kept unchanged.
    """
    code_points = (ord(char) for char in text)
    normalized_code_points = (
        cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH.get(code_point, code_point)
        for code_point in code_points
    )
    return ''.join(map(chr, normalized_code_points))


def normalize(text: str):
    """Apply NFKC normalization, then the compatibility ideograph mapping."""
    text = normalize_cjk_fullwidth(text)
    text = normalize_cjk_compatibility_ideograph(text)
    return text


@unique
class LexiconType(Enum):
    """Category assigned to a single character by ``get_lexicon_type``."""

    CHINESE = 'chinese'
    ENGLISH = 'english'
    DELIMITER = 'delimiter'
    DIGIT = 'digit'
    WHITESPACE = 'whitespace'
    # Returned when a code point is not covered by any registered interval.
    UNKNOWN = 'unknown'


def add_intervals(
    itv_tree: intervaltree.IntervalTree,
    nested_intervals: Sequence[Sequence[Tuple[int, int]]],
    lexicon_type: LexiconType,
):
    """Register every ``(begin, end)`` pair of *nested_intervals* in *itv_tree*.

    The nested sequences are flattened, and each pair is added to the tree
    (mutating it in place) with *lexicon_type* attached as the interval data.
    """
    intervals = itertools.chain.from_iterable(nested_intervals)
    for begin, end in intervals:
        # NOTE: adding one since the interval is inclusive.
        itv_tree.addi(begin, end + 1, lexicon_type)


def _build_itv_tree_lexicon_type():
    """Build the interval tree that maps code points to a ``LexiconType``.

    After all lexicon intervals are added, adjacent intervals (ordered by
    their begin) are asserted not to overlap.
    """
    itv_tree = intervaltree.IntervalTree()

    add_intervals(itv_tree, chinese.ITV_CHINESE, LexiconType.CHINESE)
    add_intervals(itv_tree, english.ITV_ENGLISH, LexiconType.ENGLISH)
    add_intervals(itv_tree, digit.ITV_DIGIT, LexiconType.DIGIT)
    add_intervals(itv_tree, delimiter.ITV_DELIMITER, LexiconType.DELIMITER)
    add_intervals(itv_tree, whitespace.ITV_WHITESPACE, LexiconType.WHITESPACE)

    # Make sure there's no overlap.
    sorted_intervals = sorted(itv_tree, key=lambda itv: itv.begin)  # type: ignore
    for prev_itv, cur_itv in zip(sorted_intervals, sorted_intervals[1:]):
        assert prev_itv.end <= cur_itv.begin  # type: ignore

    return itv_tree


# Built once at import time and shared by every get_lexicon_type() call.
_itv_tree_lexicon_type = _build_itv_tree_lexicon_type()


def get_lexicon_type(char: str):
    """Return the ``LexiconType`` of the single character *char*.

    ``LexiconType.UNKNOWN`` is returned when no registered interval contains
    the character's code point.
    """
    lexicon_types = _itv_tree_lexicon_type[ord(char)]
    if not lexicon_types:
        return LexiconType.UNKNOWN
    else:
        # The tree was checked for overlaps when it was built, so a code point
        # is expected to match exactly one interval.
        assert len(lexicon_types) == 1
        return next(iter(lexicon_types)).data
def normalize_cjk_fullwidth(text: str):
def normalize_cjk_compatibility_ideograph(text: str):
def normalize_cjk_compatibility_ideograph(text: str):
    """Replace code points listed in the CJK compatibility ideograph table.

    Each character of *text* is looked up by code point in
    ``cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH``; code points
    that are not in the table are kept unchanged.
    """
    mapping = cjk_compatibility_ideograph.CJK_COMPATIBILITY_IDEOGRAPH
    code_points = (ord(char) for char in text)
    # Fall back to the original code point when the table has no entry for it.
    normalized_code_points = (
        mapping.get(code_point, code_point) for code_point in code_points
    )
    return ''.join(map(chr, normalized_code_points))
def normalize(text: str):
class LexiconType(enum.Enum):
@unique
class LexiconType(Enum):
    """Category assigned to a single character by ``get_lexicon_type``.

    ``@unique`` guarantees that no two members share the same value.
    """

    CHINESE = 'chinese'
    ENGLISH = 'english'
    DELIMITER = 'delimiter'
    DIGIT = 'digit'
    WHITESPACE = 'whitespace'
    # Returned when a code point is not covered by any registered interval.
    UNKNOWN = 'unknown'
An enumeration of the lexicon categories that get_lexicon_type can assign to a character.
CHINESE = <LexiconType.CHINESE: 'chinese'>
ENGLISH = <LexiconType.ENGLISH: 'english'>
DELIMITER = <LexiconType.DELIMITER: 'delimiter'>
DIGIT = <LexiconType.DIGIT: 'digit'>
WHITESPACE = <LexiconType.WHITESPACE: 'whitespace'>
UNKNOWN = <LexiconType.UNKNOWN: 'unknown'>
Inherited Members
- enum.Enum
- name
- value
def add_intervals(itv_tree: intervaltree.intervaltree.IntervalTree, nested_intervals: Sequence[Sequence[Tuple[int, int]]], lexicon_type: vkit.utility.text.opt.LexiconType):
def add_intervals(
    itv_tree: intervaltree.IntervalTree,
    nested_intervals: Sequence[Sequence[Tuple[int, int]]],
    lexicon_type: LexiconType,
):
    """Register every ``(begin, end)`` pair of *nested_intervals* in *itv_tree*.

    The nested sequences are flattened, and each pair is added to the tree
    (mutating it in place) with *lexicon_type* attached as the interval data.
    """
    for begin, end in itertools.chain.from_iterable(nested_intervals):
        # NOTE: adding one since the interval is inclusive.
        itv_tree.addi(begin, end + 1, lexicon_type)
def get_lexicon_type(char: str):