vkit.element.lexicon

# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Mapping, Sequence, Optional, DefaultDict, List
from collections import defaultdict
import hashlib

import attrs
import cattrs
import iolite as io

from vkit.utility import attrs_lazy_field, dyn_structure, PathType


@attrs.define(frozen=True)
class Lexicon:
    char: str
    aliases: Sequence[str] = attrs.field(factory=tuple)
    tags: Sequence[str] = attrs.field(factory=tuple)
    meta: Optional[Mapping[str, str]] = None

    def __attrs_post_init__(self):
        object.__setattr__(self, "aliases", tuple(self.aliases))
        object.__setattr__(self, "tags", tuple(self.tags))

    @property
    def char_and_aliases(self):
        return [self.char, *self.aliases]

    @property
    def unicode_id(self):
        return hex(ord(self.char)).upper()[2:]


KEY_NO_TAG = '__no_tag'


@attrs.define
class LexiconCollection:
    lexicons: Sequence[Lexicon]

    _char_to_lexicon: Optional[Mapping[str, Lexicon]] = attrs_lazy_field()
    _tag_to_lexicons: Optional[Mapping[str, Sequence[Lexicon]]] = attrs_lazy_field()
    _tags: Optional[Sequence[str]] = attrs_lazy_field()

    def lazy_post_init(self):
        initialized = (self._char_to_lexicon is not None)
        if initialized:
            return

        self._char_to_lexicon = {}
        for lexicon in self.lexicons:
            for char in lexicon.char_and_aliases:
                assert char not in self._char_to_lexicon
                self._char_to_lexicon[char] = lexicon

        tag_to_lexicons: DefaultDict[str, List[Lexicon]] = defaultdict(list)
        for lexicon in self.lexicons:
            if lexicon.tags:
                for tag in lexicon.tags:
                    tag_to_lexicons[tag].append(lexicon)
            else:
                tag_to_lexicons[KEY_NO_TAG].append(lexicon)
        self._tag_to_lexicons = dict(tag_to_lexicons)
        self._tags = sorted(self._tag_to_lexicons)

    @property
    def char_to_lexicon(self):
        self.lazy_post_init()
        assert self._char_to_lexicon is not None
        return self._char_to_lexicon

    @property
    def tag_to_lexicons(self):
        self.lazy_post_init()
        assert self._tag_to_lexicons is not None
        return self._tag_to_lexicons

    @property
    def tags(self):
        self.lazy_post_init()
        assert self._tags is not None
        return self._tags

    def has_char(self, char: str):
        return char in self.char_to_lexicon

    def get_lexicon(self, char: str):
        return self.char_to_lexicon[char]

    @classmethod
    def from_file(cls, path: PathType):
        lexicons = dyn_structure(path, Sequence[Lexicon], force_path_type=True)
        return cls(lexicons=lexicons)

    def to_file(self, path: PathType):
        io.write_json(
            path,
            cattrs.unstructure(self.lexicons),
            indent=2,
            ensure_ascii=False,
        )

    def get_hash(self):
        sha256_algo = hashlib.sha256()
        for lexicon in self.lexicons:
            sha256_algo.update(lexicon.char.encode())
            for alias in lexicon.aliases:
                sha256_algo.update(alias.encode())
        return sha256_algo.hexdigest()
class Lexicon:
Lexicon(char: str, aliases: Sequence[str] = NOTHING, tags: Sequence[str] = NOTHING, meta: Optional[Mapping[str, str]] = None)
def __init__(self, char, aliases=NOTHING, tags=NOTHING, meta=attr_dict['meta'].default):
    _setattr(self, 'char', char)
    if aliases is not NOTHING:
        _setattr(self, 'aliases', aliases)
    else:
        _setattr(self, 'aliases', __attr_factory_aliases())
    if tags is not NOTHING:
        _setattr(self, 'tags', tags)
    else:
        _setattr(self, 'tags', __attr_factory_tags())
    _setattr(self, 'meta', meta)
    self.__attrs_post_init__()

Method generated by attrs for class Lexicon.
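
Lexicon is a frozen attrs record; __attrs_post_init__ coerces aliases and tags to tuples, and unicode_id renders the character's code point as uppercase hex without the "0x" prefix. A small construction sketch (values are made up):

from vkit.element.lexicon import Lexicon

lexicon = Lexicon(char='中', aliases=['㊥'], tags=['common'])
assert lexicon.aliases == ('㊥',)                 # coerced to tuple
assert lexicon.char_and_aliases == ['中', '㊥']
assert lexicon.unicode_id == '4E2D'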

class LexiconCollection:
LexiconCollection(lexicons: Sequence[vkit.element.lexicon.Lexicon])
def __init__(self, lexicons):
    self.lexicons = lexicons
    self._char_to_lexicon = attr_dict['_char_to_lexicon'].default
    self._tag_to_lexicons = attr_dict['_tag_to_lexicons'].default
    self._tags = attr_dict['_tags'].default

Method generated by attrs for class LexiconCollection.
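
LexiconCollection holds the lexicons plus three lazily built indexes: by character/alias, by tag, and the sorted tag list. Entries without tags are grouped under KEY_NO_TAG ('__no_tag'). A sketch with made-up entries:

from vkit.element.lexicon import Lexicon, LexiconCollection

collection = LexiconCollection(lexicons=[
    Lexicon(char='一', tags=['digit']),
    Lexicon(char='中'),  # untagged, grouped under '__no_tag'
])
assert collection.tags == ['__no_tag', 'digit']
assert collection.tag_to_lexicons['__no_tag'][0].char == '中'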

def lazy_post_init(self):
    def lazy_post_init(self):
        initialized = (self._char_to_lexicon is not None)
        if initialized:
            return

        self._char_to_lexicon = {}
        for lexicon in self.lexicons:
            for char in lexicon.char_and_aliases:
                assert char not in self._char_to_lexicon
                self._char_to_lexicon[char] = lexicon

        tag_to_lexicons: DefaultDict[str, List[Lexicon]] = defaultdict(list)
        for lexicon in self.lexicons:
            if lexicon.tags:
                for tag in lexicon.tags:
                    tag_to_lexicons[tag].append(lexicon)
            else:
                tag_to_lexicons[KEY_NO_TAG].append(lexicon)
        self._tag_to_lexicons = dict(tag_to_lexicons)
        self._tags = sorted(self._tag_to_lexicons)
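
The indexes are built on the first property access and then cached; because every character and alias must be unique across the collection, a duplicate trips the assertion during this build. For example (made-up entries):

from vkit.element.lexicon import Lexicon, LexiconCollection

collection = LexiconCollection(lexicons=[
    Lexicon(char='A'),
    Lexicon(char='B', aliases=['A']),  # 'A' is already indexed
])
try:
    collection.char_to_lexicon  # first access triggers lazy_post_init
except AssertionError:
    print('duplicate char/alias across lexicons')
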
def has_char(self, char: str):
    def has_char(self, char: str):
        return char in self.char_to_lexicon
def get_lexicon(self, char: str):
    def get_lexicon(self, char: str):
        return self.char_to_lexicon[char]
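
Both helpers go through the lazily built char_to_lexicon mapping, so an alias resolves to the same Lexicon object as its canonical character. A sketch (made-up entry):

from vkit.element.lexicon import Lexicon, LexiconCollection

collection = LexiconCollection(lexicons=[Lexicon(char='中', aliases=['㊥'])])
assert collection.has_char('中') and collection.has_char('㊥')
assert collection.get_lexicon('㊥') is collection.get_lexicon('中')
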
@classmethod
def from_file(cls, path: Union[str, os.PathLike]):
    @classmethod
    def from_file(cls, path: PathType):
        lexicons = dyn_structure(path, Sequence[Lexicon], force_path_type=True)
        return cls(lexicons=lexicons)
def to_file(self, path: Union[str, os.PathLike]):
    def to_file(self, path: PathType):
        io.write_json(
            path,
            cattrs.unstructure(self.lexicons),
            indent=2,
            ensure_ascii=False,
        )
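
to_file writes the unstructured lexicons as JSON through iolite, and from_file structures a file back into Lexicon objects via dyn_structure. Assuming dyn_structure accepts the JSON that to_file produces (which the paired API suggests), a round trip preserves the entries; the temporary path below is only for illustration:

import pathlib
import tempfile

from vkit.element.lexicon import Lexicon, LexiconCollection

collection = LexiconCollection(lexicons=[Lexicon(char='中', aliases=['㊥'])])

with tempfile.TemporaryDirectory() as tmp:
    path = pathlib.Path(tmp) / 'lexicons.json'
    collection.to_file(path)
    reloaded = LexiconCollection.from_file(path)
    assert reloaded.get_hash() == collection.get_hash()
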
def get_hash(self):
    def get_hash(self):
        sha256_algo = hashlib.sha256()
        for lexicon in self.lexicons:
            sha256_algo.update(lexicon.char.encode())
            for alias in lexicon.aliases:
                sha256_algo.update(alias.encode())
        return sha256_algo.hexdigest()
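
get_hash feeds each lexicon's char and aliases, in order, into a single SHA-256, so two collections with the same characters and aliases hash identically even if their tags or meta differ. For example (made-up entries):

from vkit.element.lexicon import Lexicon, LexiconCollection

a = LexiconCollection(lexicons=[Lexicon(char='中', aliases=['㊥'])])
b = LexiconCollection(lexicons=[Lexicon(char='中', aliases=['㊥'], tags=['common'])])
assert a.get_hash() == b.get_hash()  # tags and meta do not affect the hash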