vkit.element.lexicon

  1# Copyright 2022 vkit-x Administrator. All Rights Reserved.
  2#
  3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
  4#
  5# The commercial license gives you the full rights to create and distribute software
  6# on your own terms without any SSPL license obligations. For more information,
  7# please see the "LICENSE_COMMERCIAL.txt" file.
  8#
  9# This project is also available under Server Side Public License (SSPL).
 10# The SSPL licensing is ideal for use cases such as open source projects with
 11# SSPL distribution, student/academic purposes, hobby projects, internal research
 12# projects without external distribution, or other projects where all SSPL
 13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
 14from typing import Mapping, Sequence, Optional, DefaultDict, List
 15from collections import defaultdict
 16import hashlib
 17
 18import attrs
 19import cattrs
 20import iolite as io
 21
 22from vkit.utility import attrs_lazy_field, unwrap_optional_field, dyn_structure, PathType
 23
 24
 25@attrs.define(frozen=True)
 26class Lexicon:
 27    char: str
 28    aliases: Sequence[str] = attrs.field(factory=tuple)
 29    tags: Sequence[str] = attrs.field(factory=tuple)
 30    meta: Optional[Mapping[str, str]] = None
 31
 32    def __attrs_post_init__(self):
 33        object.__setattr__(self, "aliases", tuple(self.aliases))
 34        object.__setattr__(self, "tags", tuple(self.tags))
 35
 36    @property
 37    def char_and_aliases(self):
 38        return [self.char, *self.aliases]
 39
 40    @property
 41    def unicode_id(self):
 42        return hex(ord(self.char)).upper()[2:]
 43
 44
 45KEY_NO_TAG = '__no_tag'
 46
 47
 48@attrs.define
 49class LexiconCollection:
 50    lexicons: Sequence[Lexicon]
 51
 52    _char_to_lexicon: Optional[Mapping[str, Lexicon]] = attrs_lazy_field()
 53    _tag_to_lexicons: Optional[Mapping[str, Sequence[Lexicon]]] = attrs_lazy_field()
 54    _tags: Optional[Sequence[str]] = attrs_lazy_field()
 55
 56    def lazy_post_init(self):
 57        initialized = (self._char_to_lexicon is not None)
 58        if initialized:
 59            return
 60
 61        self._char_to_lexicon = {}
 62        for lexicon in self.lexicons:
 63            for char in lexicon.char_and_aliases:
 64                assert char not in self._char_to_lexicon
 65                self._char_to_lexicon[char] = lexicon
 66
 67        tag_to_lexicons: DefaultDict[str, List[Lexicon]] = defaultdict(list)
 68        for lexicon in self.lexicons:
 69            if lexicon.tags:
 70                for tag in lexicon.tags:
 71                    tag_to_lexicons[tag].append(lexicon)
 72            else:
 73                tag_to_lexicons[KEY_NO_TAG].append(lexicon)
 74        self._tag_to_lexicons = dict(tag_to_lexicons)
 75        self._tags = sorted(self._tag_to_lexicons)
 76
 77    @property
 78    def char_to_lexicon(self):
 79        self.lazy_post_init()
 80        return unwrap_optional_field(self._char_to_lexicon)
 81
 82    @property
 83    def tag_to_lexicons(self):
 84        self.lazy_post_init()
 85        return unwrap_optional_field(self._tag_to_lexicons)
 86
 87    @property
 88    def tags(self):
 89        self.lazy_post_init()
 90        return unwrap_optional_field(self._tags)
 91
 92    def has_char(self, char: str):
 93        return char in self.char_to_lexicon
 94
 95    def get_lexicon(self, char: str):
 96        return self.char_to_lexicon[char]
 97
 98    @classmethod
 99    def from_file(cls, path: PathType):
100        lexicons = dyn_structure(path, Sequence[Lexicon], force_path_type=True)
101        return cls(lexicons=lexicons)
102
103    def to_file(self, path: PathType):
104        io.write_json(
105            path,
106            cattrs.unstructure(self.lexicons),
107            indent=2,
108            ensure_ascii=False,
109        )
110
111    def get_hash(self):
112        sha256_algo = hashlib.sha256()
113        for lexicon in self.lexicons:
114            sha256_algo.update(lexicon.char.encode())
115            for alias in lexicon.aliases:
116                sha256_algo.update(alias.encode())
117        return sha256_algo.hexdigest()
class Lexicon:
    """A single character entry with its aliases, tags and optional metadata."""

    # Primary character of the entry.
    char: str
    # Alternative spellings of the entry; defaults to an empty tuple.
    aliases: Sequence[str] = attrs.field(factory=tuple)
    # Tags attached to the entry; defaults to an empty tuple.
    tags: Sequence[str] = attrs.field(factory=tuple)
    # Optional string-to-string metadata.
    meta: Optional[Mapping[str, str]] = None

    def __attrs_post_init__(self):
        """Coerce ``aliases`` and ``tags`` to tuples."""
        # Written through ``object.__setattr__`` so the assignment also works
        # when regular attribute assignment is blocked.
        for field_name in ("aliases", "tags"):
            object.__setattr__(self, field_name, tuple(getattr(self, field_name)))

    @property
    def char_and_aliases(self):
        """Return a new list holding ``char`` followed by every alias."""
        return [self.char] + list(self.aliases)

    @property
    def unicode_id(self):
        """Return the code point of ``char`` as upper-case hex without ``0x``."""
        return format(ord(self.char), "X")
Lexicon( char: str, aliases: Sequence[str] = NOTHING, tags: Sequence[str] = NOTHING, meta: Union[Mapping[str, str], NoneType] = None)
 2def __init__(self, char, aliases=NOTHING, tags=NOTHING, meta=attr_dict['meta'].default):
 3    _setattr = _cached_setattr_get(self)
 4    _setattr('char', char)
 5    if aliases is not NOTHING:
 6        _setattr('aliases', aliases)
 7    else:
 8        _setattr('aliases', __attr_factory_aliases())
 9    if tags is not NOTHING:
10        _setattr('tags', tags)
11    else:
12        _setattr('tags', __attr_factory_tags())
13    _setattr('meta', meta)
14    self.__attrs_post_init__()

Method generated by attrs for class Lexicon.

class LexiconCollection:
    """A sequence of lexicons with lazily built lookup tables.

    The char and tag tables are built on the first access of
    ``char_to_lexicon``, ``tag_to_lexicons`` or ``tags``.
    """

    lexicons: Sequence[Lexicon]

    # Lazily populated caches, filled in by ``lazy_post_init``.
    _char_to_lexicon: Optional[Mapping[str, Lexicon]] = attrs_lazy_field()
    _tag_to_lexicons: Optional[Mapping[str, Sequence[Lexicon]]] = attrs_lazy_field()
    _tags: Optional[Sequence[str]] = attrs_lazy_field()

    def lazy_post_init(self):
        """Build the lookup tables once; later calls return immediately.

        Raises:
            AssertionError: if a char or alias occurs more than once across
                ``lexicons`` (only while assertions are enabled).
        """
        if self._char_to_lexicon is not None:
            return

        # Build every table in locals and publish at the end, so a failure
        # midway never leaves the instance looking initialized while it is
        # only partially populated.
        char_to_lexicon = {}
        for lexicon in self.lexicons:
            for char in lexicon.char_and_aliases:
                assert char not in char_to_lexicon, \
                    f'Duplicated char or alias: {char!r}'
                char_to_lexicon[char] = lexicon

        tag_to_lexicons: DefaultDict[str, List[Lexicon]] = defaultdict(list)
        for lexicon in self.lexicons:
            # Lexicons without tags are grouped under KEY_NO_TAG.
            for tag in lexicon.tags or (KEY_NO_TAG,):
                tag_to_lexicons[tag].append(lexicon)

        self._tag_to_lexicons = dict(tag_to_lexicons)
        self._tags = sorted(self._tag_to_lexicons)
        # Assigned last: this field doubles as the "initialized" flag.
        self._char_to_lexicon = char_to_lexicon

    @property
    def char_to_lexicon(self):
        """Mapping from each char and alias to its lexicon."""
        self.lazy_post_init()
        return unwrap_optional_field(self._char_to_lexicon)

    @property
    def tag_to_lexicons(self):
        """Mapping from each tag (or ``KEY_NO_TAG``) to the lexicons under it."""
        self.lazy_post_init()
        return unwrap_optional_field(self._tag_to_lexicons)

    @property
    def tags(self):
        """Sorted list of the keys of ``tag_to_lexicons``."""
        self.lazy_post_init()
        return unwrap_optional_field(self._tags)

    def has_char(self, char: str):
        """Return whether ``char`` is a key of ``char_to_lexicon``."""
        return char in self.char_to_lexicon

    def get_lexicon(self, char: str):
        """Return ``char_to_lexicon[char]``; a missing key is not handled here."""
        return self.char_to_lexicon[char]

    @classmethod
    def from_file(cls, path: PathType):
        """Create a collection from the lexicons that ``dyn_structure`` loads from ``path``."""
        lexicons = dyn_structure(path, Sequence[Lexicon], force_path_type=True)
        return cls(lexicons=lexicons)

    def to_file(self, path: PathType):
        """Write the unstructured lexicons to ``path`` as JSON (indent 2, non-ASCII kept)."""
        io.write_json(
            path,
            cattrs.unstructure(self.lexicons),
            indent=2,
            ensure_ascii=False,
        )

    def get_hash(self):
        """Return the SHA-256 hex digest over every char and alias, in order.

        Only ``char`` and ``aliases`` are fed into the digest, back to back
        and without separators; ``tags`` and ``meta`` do not contribute.
        """
        sha256_algo = hashlib.sha256()
        for lexicon in self.lexicons:
            sha256_algo.update(lexicon.char.encode())
            for alias in lexicon.aliases:
                sha256_algo.update(alias.encode())
        return sha256_algo.hexdigest()
LexiconCollection(lexicons: Sequence[vkit.element.lexicon.Lexicon])
2def __init__(self, lexicons):
3    self.lexicons = lexicons
4    self._char_to_lexicon = attr_dict['_char_to_lexicon'].default
5    self._tag_to_lexicons = attr_dict['_tag_to_lexicons'].default
6    self._tags = attr_dict['_tags'].default

Method generated by attrs for class LexiconCollection.

def lazy_post_init(self):
    """Build the char and tag lookup tables once; later calls return immediately.

    Raises:
        AssertionError: if a char or alias occurs more than once across
            ``self.lexicons`` (only while assertions are enabled).
    """
    if self._char_to_lexicon is not None:
        return

    # Build every table in locals and publish at the end, so a failure midway
    # never leaves the instance looking initialized while it is only
    # partially populated.
    char_to_lexicon = {}
    for lexicon in self.lexicons:
        for char in lexicon.char_and_aliases:
            assert char not in char_to_lexicon, \
                f'Duplicated char or alias: {char!r}'
            char_to_lexicon[char] = lexicon

    tag_to_lexicons: DefaultDict[str, List[Lexicon]] = defaultdict(list)
    for lexicon in self.lexicons:
        # Lexicons without tags are grouped under KEY_NO_TAG.
        for tag in lexicon.tags or (KEY_NO_TAG,):
            tag_to_lexicons[tag].append(lexicon)

    self._tag_to_lexicons = dict(tag_to_lexicons)
    self._tags = sorted(self._tag_to_lexicons)
    # Assigned last: this field doubles as the "initialized" flag.
    self._char_to_lexicon = char_to_lexicon
def has_char(self, char: str):
    """Return whether ``char`` is a key of ``self.char_to_lexicon``."""
    lookup = self.char_to_lexicon
    return char in lookup
def get_lexicon(self, char: str):
    """Return the entry stored for ``char`` in ``self.char_to_lexicon``.

    A missing key is not handled here; the mapping's own lookup error
    propagates to the caller.
    """
    lookup = self.char_to_lexicon
    return lookup[char]
@classmethod
def from_file(cls, path: PathType):
    """Create an instance from the lexicons that ``dyn_structure`` loads from ``path``."""
    return cls(
        lexicons=dyn_structure(path, Sequence[Lexicon], force_path_type=True),
    )
def to_file(self, path: PathType):
    """Write ``self.lexicons`` to ``path`` as JSON (indent 2, non-ASCII kept)."""
    payload = cattrs.unstructure(self.lexicons)
    io.write_json(path, payload, indent=2, ensure_ascii=False)
def get_hash(self):
    """Return the SHA-256 hex digest over every char and alias, in order.

    Each lexicon contributes its ``char`` followed by its ``aliases``; the
    encoded strings are fed into the digest back to back, without separators.
    """
    digest = hashlib.sha256()
    for lexicon in self.lexicons:
        digest.update(lexicon.char.encode())
        for text in lexicon.aliases:
            digest.update(text.encode())
    return digest.hexdigest()