vkit.utility.text.const.chinese

Constants for detecting Chinese characters.

# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
'''
Consts for detecting chinese chars.
'''
from typing import Sequence, Set, Tuple

 19#: Chinese Chars.
 20#: Pulled from https://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php
 21#:
 22#: Range generation::
 23#:
 24#:  lines = '''copy paste the table here'''
 25#:  [l.split('\t') for l in lines.strip().split('\n')]
 26ITV_CHINESE: Sequence[Sequence[Tuple[int, int]]] = [[
 27    (0x4E00, 0x9FA5),
 28    (0x9FA6, 0x9FEF),
 29    (0x3400, 0x4DB5),
 30    (0x20000, 0x2A6D6),
 31    (0x2A700, 0x2B734),
 32    (0x2B740, 0x2B81D),
 33    (0x2B820, 0x2CEA1),
 34    (0x2CEB0, 0x2EBE0),
 35    (0x2F00, 0x2FD5),
 36    (0x2E80, 0x2EF3),
 37    (0xF900, 0xFAD9),
 38    (0x2F800, 0x2FA1D),
 39    (0xE815, 0xE86F),
 40    (0xE400, 0xE5E8),
 41    (0xE600, 0xE6CF),
 42    (0x3007, 0x3007),
 43    (0x31C0, 0x31E3),
 44    (0x2FF0, 0x2FFB),
 45    (0x3105, 0x312F),
 46    (0x31A0, 0x31BA),
 47]]
 48
 49# From vkit.text.lexicon.builder:extract_radicals_from_unihan
 50CHINESE_RADICAL = {
 51    # Add manually.
 52    '彳',
 53    '丬',
 54    '乚',
 55    '丄',
 56    '乛',
 57    # From build_black_list_from_unihan:
 58    '㓁',
 59    '㔾',
 60    '㣺',
 61    # '䑕',
 62    # '业',
 63    '丨',
 64    '丶',
 65    '丿',
 66    '亅',
 67    '亠',
 68    '亻',
 69    # '儿',
 70    '冂',
 71    '冖',
 72    '冫',
 73    '凵',
 74    '刂',
 75    '勹',
 76    '匚',
 77    '匸',
 78    '卩',
 79    # '厂',
 80    '厶',
 81    '夂',
 82    '夊',
 83    '宀',
 84    # '宍',
 85    # '寸',
 86    # '尢',
 87    # '尣',
 88    # '尸',
 89    # '屮',  # COMMONLY USED.
 90    '巛',
 91    # '巾',
 92    '廴',
 93    # '廾',
 94    # '弓',
 95    '彐',
 96    '彑',
 97    '彡',
 98    '忄',
 99    '扌',
100    # '攴',
101    '攵',
102    # '斤',
103    # '无',
104    # '曰',
105    # '月',
106    # '欠',
107    # '歯',
108    # '气',
109    '氵',
110    '灬',
111    '爫',
112    # '父',
113    '爿',
114    # '牛',
115    '牜',
116    # '犬',
117    '犭',
118    # '玄',
119    '疒',
120    '癶',
121    # '矛',
122    '礻',
123    '禸',
124    # '穴',
125    # '竹',
126    # '糸',
127    '糹',
128    '纟',
129    # '网',
130    '罒',
131    # '羊',
132    '耂',
133    '肀',
134    # '肉',
135    # '肭',
136    # '舌',
137    # '舟',
138    # '艸',
139    # '艹',  # COMMONLY USED.
140    '虍',
141    # '虫',
142    # '血',
143    '衤',
144    '襾',
145    '覀',
146    '訁',
147    '讠',
148    # '豕',
149    # '豸',
150    # '辵',
151    '辶',
152    # '釆',
153    '釒',
154    '钅',
155    '阝',
156    # '隶',
157    # '隹',
158    # '雨',
159    # '韋',
160    # '韦',
161    # '韭',
162    # '食',
163    '飠',
164    '饣',
165    # '馬',
166    # '马',
167    # '髟',
168    # '鬲',
169    # '魚',
170    # '鱼',
171    # '鳥',
172    # '鸟',
173    # '鹿',
174    # '麥',
175    # '麦',
176    # '黍',
177    # '黹',
178    # '鼠',
179    # '鼻',
180    # '齒',
181    # '齿',
182    # '鹿',
183}