vkit.engine.char_sampler.lexicon
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence, Mapping, Optional, List

import attrs
from numpy.random import Generator as RandomGenerator

from vkit.utility import rng_choice, normalize_to_probs
from ..interface import Engine, EngineExecutorFactory
from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig


@attrs.define
class CharSamplerLexiconEngineInitConfig:
    # Optional per-tag sampling weight. When set (and non-empty), every tag of
    # the lexicon collection must have an entry; otherwise each tag is weighted
    # by the number of lexicons carrying that tag.
    tag_to_weight: Optional[Mapping[str, float]] = None
    # Target probability of drawing the space key at each position.
    # Values <= 0.0 disable spaces; values >= 1.0 are rejected by the engine.
    prob_space: float = 0.0


CharSamplerLexiconEngineInitResource = CharSamplerEngineInitResource


class CharSamplerLexiconEngine(
    Engine[
        CharSamplerLexiconEngineInitConfig,
        CharSamplerLexiconEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler that draws chars from tagged lexicons, optionally mixing in spaces."""

    # Pseudo tag standing for "emit a space" in the with-space distribution.
    KEY_SPACE = '__space'

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, 'lexicon'."""
        return 'lexicon'

    def __init__(
        self,
        init_config: CharSamplerLexiconEngineInitConfig,
        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
    ):
        """
        Build the tag distributions used by `run`.

        Raises:
            RuntimeError: if `init_config.prob_space` >= 1.0, or if
                `init_config.tag_to_weight` is set but lacks a tag of the
                lexicon collection.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        prob_space = init_config.prob_space
        if prob_space >= 1.0:
            # The odds p / (1 - p) below are undefined at p == 1.0 and negative
            # beyond it, so fail early with an explicit configuration error.
            raise RuntimeError(f'invalid prob_space={prob_space}, expected prob_space < 1.0')

        tags = self.lexicon_collection.tags
        tag_to_weight = init_config.tag_to_weight

        tag_weights = []
        for tag in tags:
            if tag_to_weight:
                # From config.
                if tag not in tag_to_weight:
                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
                weight = tag_to_weight[tag]
            else:
                # Based on the number of tagged lexicons.
                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
            tag_weights.append(weight)

        self.tags = tags
        self.tag_probs = normalize_to_probs(tag_weights)

        self.with_space_tags = self.tags
        self.with_space_tag_probs = self.tag_probs
        if prob_space > 0.0:
            self.with_space_tags = (*self.tags, self.KEY_SPACE)
            # NOTE(review): assuming `normalize_to_probs` returns values summing
            # to 1, appending the odds p / (1 - p) and renormalizing gives the
            # space key a probability of exactly p.
            self.with_space_tag_probs = normalize_to_probs((
                *self.tag_probs,
                prob_space / (1 - prob_space),
            ))

    def run(
        self,
        run_config: CharSamplerEngineRunConfig,
        rng: Optional[RandomGenerator] = None,
    ) -> Sequence[str]:
        """
        Sample a list of chars.

        The list has `run_config.num_chars` items, or, when
        `run_config.enable_aggregator_mode` is set, a random length drawn from
        `rng.integers(1, run_config.num_chars + 1)`.
        """
        assert rng is not None

        num_chars = run_config.num_chars

        if run_config.enable_aggregator_mode:
            num_chars = int(rng.integers(1, run_config.num_chars + 1))

        chars: List[str] = []
        for char_idx in range(num_chars):
            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
            if tag == self.KEY_SPACE:
                if char_idx == 0 \
                        or char_idx == num_chars - 1 \
                        or chars[char_idx - 1].isspace():
                    # Disallow:
                    # 1. leading or trailing space.
                    # 2. consecutive spaces.
                    # Resample from the distribution without the space key.
                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)

            if tag == self.KEY_SPACE:
                chars.append(' ')
            else:
                lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
                char = rng_choice(rng, lexicon.char_and_aliases)
                chars.append(char)

        return chars


char_sampler_lexicon_engine_executor_factory = EngineExecutorFactory(CharSamplerLexiconEngine)
class
CharSamplerLexiconEngineInitConfig:
class CharSamplerLexiconEngineInitConfig:
    # Optional per-tag sampling weight. When set (and non-empty), the engine
    # requires an entry for every tag of its lexicon collection; when unset,
    # the engine weights each tag by the number of lexicons carrying it.
    tag_to_weight: Optional[Mapping[str, float]] = None
    # Target probability of drawing a space; the engine only adds the space
    # key to its distribution when this value is > 0.0.
    prob_space: float = 0.0
CharSamplerLexiconEngineInitConfig( tag_to_weight: Union[Mapping[str, float], NoneType] = None, prob_space: float = 0.0)
# Generated by attrs: the parameter defaults are read from the attribute
# definitions held in `attr_dict`, and each argument is stored unchanged on
# the instance.
def __init__(self, tag_to_weight=attr_dict['tag_to_weight'].default, prob_space=attr_dict['prob_space'].default):
    self.tag_to_weight = tag_to_weight
    self.prob_space = prob_space
Method generated by attrs for class CharSamplerLexiconEngineInitConfig.
class
CharSamplerLexiconEngine(vkit.engine.interface.Engine[vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, vkit.engine.char_sampler.type.CharSamplerEngineInitResource, vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, typing.Sequence[str]]):
class CharSamplerLexiconEngine(
    Engine[
        CharSamplerLexiconEngineInitConfig,
        CharSamplerLexiconEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler that draws chars from tagged lexicons, optionally mixing in spaces."""

    # Pseudo tag standing for "emit a space" in the with-space distribution.
    KEY_SPACE = '__space'

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, 'lexicon'."""
        return 'lexicon'

    def __init__(
        self,
        init_config: CharSamplerLexiconEngineInitConfig,
        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
    ):
        """
        Build the tag distributions used by `run`.

        Raises:
            RuntimeError: if `init_config.prob_space` >= 1.0, or if
                `init_config.tag_to_weight` is set but lacks a tag of the
                lexicon collection.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        prob_space = init_config.prob_space
        if prob_space >= 1.0:
            # The odds p / (1 - p) below are undefined at p == 1.0 and negative
            # beyond it, so fail early with an explicit configuration error.
            raise RuntimeError(f'invalid prob_space={prob_space}, expected prob_space < 1.0')

        tags = self.lexicon_collection.tags
        tag_to_weight = init_config.tag_to_weight

        tag_weights = []
        for tag in tags:
            if tag_to_weight:
                # From config.
                if tag not in tag_to_weight:
                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
                weight = tag_to_weight[tag]
            else:
                # Based on the number of tagged lexicons.
                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
            tag_weights.append(weight)

        self.tags = tags
        self.tag_probs = normalize_to_probs(tag_weights)

        self.with_space_tags = self.tags
        self.with_space_tag_probs = self.tag_probs
        if prob_space > 0.0:
            self.with_space_tags = (*self.tags, self.KEY_SPACE)
            # NOTE(review): assuming `normalize_to_probs` returns values summing
            # to 1, appending the odds p / (1 - p) and renormalizing gives the
            # space key a probability of exactly p.
            self.with_space_tag_probs = normalize_to_probs((
                *self.tag_probs,
                prob_space / (1 - prob_space),
            ))

    def run(
        self,
        run_config: CharSamplerEngineRunConfig,
        rng: Optional[RandomGenerator] = None,
    ) -> Sequence[str]:
        """
        Sample a list of chars.

        The list has `run_config.num_chars` items, or, when
        `run_config.enable_aggregator_mode` is set, a random length drawn from
        `rng.integers(1, run_config.num_chars + 1)`.
        """
        assert rng is not None

        num_chars = run_config.num_chars

        if run_config.enable_aggregator_mode:
            num_chars = int(rng.integers(1, run_config.num_chars + 1))

        chars: List[str] = []
        for char_idx in range(num_chars):
            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
            if tag == self.KEY_SPACE:
                if char_idx == 0 \
                        or char_idx == num_chars - 1 \
                        or chars[char_idx - 1].isspace():
                    # Disallow:
                    # 1. leading or trailing space.
                    # 2. consecutive spaces.
                    # Resample from the distribution without the space key.
                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)

            if tag == self.KEY_SPACE:
                chars.append(' ')
            else:
                lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
                char = rng_choice(rng, lexicon.char_and_aliases)
                chars.append(char)

        return chars
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.
This class can then be used as follows::
    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default
CharSamplerLexiconEngine( init_config: vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None)
def __init__(
    self,
    init_config: CharSamplerLexiconEngineInitConfig,
    init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
):
    """
    Build the tag distributions used when sampling chars.

    Sets `self.tags` / `self.tag_probs` (distribution over lexicon tags) and
    `self.with_space_tags` / `self.with_space_tag_probs` (the same distribution,
    extended with `KEY_SPACE` when `init_config.prob_space` > 0.0).

    Raises:
        RuntimeError: if `init_config.prob_space` >= 1.0, or if
            `init_config.tag_to_weight` is set but lacks a tag of the
            lexicon collection.
    """
    super().__init__(init_config, init_resource)

    assert init_resource
    self.lexicon_collection = init_resource.lexicon_collection

    prob_space = init_config.prob_space
    if prob_space >= 1.0:
        # The odds p / (1 - p) below are undefined at p == 1.0 and negative
        # beyond it, so fail early with an explicit configuration error.
        raise RuntimeError(f'invalid prob_space={prob_space}, expected prob_space < 1.0')

    tags = self.lexicon_collection.tags
    tag_to_weight = init_config.tag_to_weight

    tag_weights = []
    for tag in tags:
        if tag_to_weight:
            # From config.
            if tag not in tag_to_weight:
                raise RuntimeError(f'missing tag={tag} in tag_to_weight')
            weight = tag_to_weight[tag]
        else:
            # Based on the number of tagged lexicons.
            weight = len(self.lexicon_collection.tag_to_lexicons[tag])
        tag_weights.append(weight)

    self.tags = tags
    self.tag_probs = normalize_to_probs(tag_weights)

    self.with_space_tags = self.tags
    self.with_space_tag_probs = self.tag_probs
    if prob_space > 0.0:
        self.with_space_tags = (*self.tags, self.KEY_SPACE)
        # NOTE(review): assuming `normalize_to_probs` returns values summing
        # to 1, appending the odds p / (1 - p) and renormalizing gives the
        # space key a probability of exactly p.
        self.with_space_tag_probs = normalize_to_probs((
            *self.tag_probs,
            prob_space / (1 - prob_space),
        ))
def
run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: Union[numpy.random._generator.Generator, NoneType] = None) -> Sequence[str]:
def run(
    self,
    run_config: CharSamplerEngineRunConfig,
    rng: Optional[RandomGenerator] = None,
) -> Sequence[str]:
    """
    Sample a list of chars from the tagged lexicons.

    The list has `run_config.num_chars` items, or, when
    `run_config.enable_aggregator_mode` is set, a random length drawn from
    `rng.integers(1, run_config.num_chars + 1)`. A space is never placed at
    the first or last position, nor right after a whitespace char.
    """
    assert rng is not None

    if run_config.enable_aggregator_mode:
        total = int(rng.integers(1, run_config.num_chars + 1))
    else:
        total = run_config.num_chars

    last_idx = total - 1
    sampled: List[str] = []
    for idx in range(total):
        tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)

        # A space is rejected at either end of the sequence and directly after
        # another whitespace char; in that case draw again from the
        # distribution that has no space key.
        if tag == self.KEY_SPACE and (
            idx == 0 or idx == last_idx or sampled[-1].isspace()
        ):
            tag = rng_choice(rng, self.tags, probs=self.tag_probs)

        if tag != self.KEY_SPACE:
            # Pick one lexicon carrying the tag, then one of its char forms.
            lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
            sampled.append(rng_choice(rng, lexicon.char_and_aliases))
        else:
            sampled.append(' ')

    return sampled