vkit.utility.text.const.chinese
Constants for detecting Chinese characters.
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
'''
Constants for detecting Chinese characters.

Defines two module-level constants:

* ``ITV_CHINESE``: code point intervals treated as Chinese characters.
* ``CHINESE_RADICAL``: single-character strings treated as radicals.
'''
from typing import Sequence, Tuple

#: Code point intervals of Chinese characters.
#: Pulled from https://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php
#:
#: The value is one outer list wrapping a single inner list of
#: ``(first, last)`` code point pairs. The pair ``(0x3007, 0x3007)`` covers a
#: single code point, which suggests both bounds are inclusive -- confirm
#: against the code that consumes this constant.
#:
#: The block labels in the comments below follow the Unicode code charts and
#: are informational only -- verify them if exactness matters.
#:
#: Range generation::
#:
#:    lines = '''copy paste the table here'''
#:    [l.split('\t') for l in lines.strip().split('\n')]
ITV_CHINESE: Sequence[Sequence[Tuple[int, int]]] = [
    [
        # CJK Unified Ideographs (basic block and its later additions).
        (0x4E00, 0x9FA5),
        (0x9FA6, 0x9FEF),
        # CJK Unified Ideographs Extension A through Extension F.
        (0x3400, 0x4DB5),
        (0x20000, 0x2A6D6),
        (0x2A700, 0x2B734),
        (0x2B740, 0x2B81D),
        (0x2B820, 0x2CEA1),
        (0x2CEB0, 0x2EBE0),
        # Kangxi Radicals and CJK Radicals Supplement.
        (0x2F00, 0x2FD5),
        (0x2E80, 0x2EF3),
        # CJK Compatibility Ideographs and their supplement.
        (0xF900, 0xFAD9),
        (0x2F800, 0x2FA1D),
        # Ranges inside the Private Use Area.
        (0xE815, 0xE86F),
        (0xE400, 0xE5E8),
        (0xE600, 0xE6CF),
        # Ideographic number zero.
        (0x3007, 0x3007),
        # CJK Strokes.
        (0x31C0, 0x31E3),
        # Ideographic Description Characters.
        (0x2FF0, 0x2FFB),
        # Bopomofo and Bopomofo Extended.
        (0x3105, 0x312F),
        (0x31A0, 0x31BA),
    ],
]

# From vkit.text.lexicon.builder:extract_radicals_from_unihan
#
# Candidates that are deliberately NOT part of the set (they were listed but
# commented out in the generated output):
#   䑕 业 儿 厂 宍 寸 尢 尣 尸 巾 廾 弓 攴 斤 无 曰 月 欠 歯 气 父 牛 犬 玄 矛 穴 竹
#   糸 网 羊 肉 肭 舌 舟 艸 虫 血 豕 豸 辵 釆 隶 隹 雨 韋 韦 韭 食 馬 马 髟 鬲 魚 鱼
#   鳥 鸟 鹿 麥 麦 黍 黹 鼠 鼻 齒 齿
#   屮 艹  (these two were marked "COMMONLY USED")
CHINESE_RADICAL = {
    # Add manually.
    '彳', '丬', '乚', '丄', '乛',
    # From build_black_list_from_unihan:
    '㓁', '㔾', '㣺',
    '丨', '丶', '丿', '亅', '亠', '亻',
    '冂', '冖', '冫', '凵', '刂', '勹', '匚', '匸', '卩',
    '厶', '夂', '夊', '宀',
    '巛', '廴',
    '彐', '彑', '彡', '忄', '扌',
    '攵',
    '氵', '灬', '爫', '爿', '牜', '犭',
    '疒', '癶',
    '礻', '禸',
    '糹', '纟',
    '罒',
    '耂', '肀',
    '虍',
    '衤', '襾', '覀', '訁', '讠',
    '辶',
    '釒', '钅', '阝',
    '飠', '饣',
}