vkit.engine.char_sampler.lexicon
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence, Mapping, Optional, List

import attrs
from numpy.random import Generator as RandomGenerator

from vkit.utility import rng_choice, normalize_to_probs
from vkit.engine.interface import Engine, EngineExecutorFactory
from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig


@attrs.define
class CharSamplerLexiconEngineInitConfig:
    """Configuration of the lexicon-based char sampler."""

    # Optional per-tag sampling weight. When unset (or empty), each tag is
    # weighted by the number of lexicons carrying that tag.
    tag_to_weight: Optional[Mapping[str, float]] = None
    # Probability of sampling a space; values <= 0.0 disable space sampling.
    # Must be less than 1.0.
    prob_space: float = 0.0


CharSamplerLexiconEngineInitResource = CharSamplerEngineInitResource


class CharSamplerLexiconEngine(
    Engine[
        CharSamplerLexiconEngineInitConfig,
        CharSamplerLexiconEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Sample chars by first picking a lexicon tag, then a lexicon, then a char."""

    # Pseudo tag standing for "emit a space" in the with-space distribution.
    KEY_SPACE = '__space'

    @classmethod
    def get_type_name(cls) -> str:
        """Return the engine type name, ``'lexicon'``."""
        return 'lexicon'

    def __init__(
        self,
        init_config: CharSamplerLexiconEngineInitConfig,
        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
    ):
        """Build the tag distributions used by ``run``.

        Args:
            init_config: Tag weights and the space probability.
            init_resource: Must be provided; supplies ``lexicon_collection``.

        Raises:
            RuntimeError: If ``tag_to_weight`` is given but lacks a tag of the
                lexicon collection, or if ``prob_space`` is >= 1.0.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        tag_to_weight = init_config.tag_to_weight
        tag_weights = []
        for tag in self.lexicon_collection.tags:
            if tag_to_weight:
                # From config.
                if tag not in tag_to_weight:
                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
                weight = tag_to_weight[tag]
            else:
                # Based on the number of tagged lexicons.
                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
            tag_weights.append(weight)

        self.tags = self.lexicon_collection.tags
        self.tag_probs = normalize_to_probs(tag_weights)

        # Without a positive prob_space the two distributions are the same.
        self.with_space_tags = self.tags
        self.with_space_tag_probs = self.tag_probs

        prob_space = init_config.prob_space
        if prob_space > 0.0:
            if prob_space >= 1.0:
                # 1.0 would divide by zero below; > 1.0 would produce a
                # negative weight and hence invalid probabilities.
                raise RuntimeError(f'prob_space={prob_space} must be less than 1.0')
            self.with_space_tags = (*self.tags, self.KEY_SPACE)
            # The odds p / (1 - p) are appended to the tag probs so that,
            # assuming tag_probs sum to 1, the space ends up with probability
            # p after normalization.
            self.with_space_tag_probs = normalize_to_probs((
                *self.tag_probs,
                prob_space / (1 - prob_space),
            ))

    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
        """Sample a sequence of chars.

        Args:
            run_config: Provides ``num_chars`` and ``enable_aggregator_mode``.
            rng: Random generator driving every sampling decision.

        Returns:
            The sampled chars, one list item per position.
        """
        total = run_config.num_chars

        if run_config.enable_aggregator_mode:
            # In aggregator mode the length is itself random, in [1, num_chars].
            total = int(rng.integers(1, run_config.num_chars + 1))

        last_idx = total - 1
        sampled: List[str] = []
        for idx in range(total):
            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
            if tag == self.KEY_SPACE:
                # Disallow:
                # 1. leading or trailing space.
                # 2. consecutive spaces.
                if idx in (0, last_idx) or sampled[-1].isspace():
                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)

            if tag == self.KEY_SPACE:
                sampled.append(' ')
                continue

            lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
            sampled.append(rng_choice(rng, lexicon.char_and_aliases))

        return sampled


char_sampler_lexicon_engine_executor_factory = EngineExecutorFactory(CharSamplerLexiconEngine)
class
CharSamplerLexiconEngineInitConfig:
class CharSamplerLexiconEngineInitConfig:
    """Configuration of the lexicon-based char sampler."""

    # Optional per-tag sampling weight. When unset (or empty), the engine
    # weights each tag by the number of lexicons carrying that tag.
    tag_to_weight: Optional[Mapping[str, float]] = None
    # Probability of sampling a space; values <= 0.0 disable space sampling.
    prob_space: float = 0.0
CharSamplerLexiconEngineInitConfig( tag_to_weight: Union[Mapping[str, float], NoneType] = None, prob_space: float = 0.0)
# NOTE: generated initializer; the parameter defaults are read from the
# attribute definitions held in `attr_dict`.
def __init__(self, tag_to_weight=attr_dict['tag_to_weight'].default, prob_space=attr_dict['prob_space'].default):
    # Store both constructor arguments on the instance unchanged.
    self.tag_to_weight = tag_to_weight
    self.prob_space = prob_space
Method generated by attrs for class CharSamplerLexiconEngineInitConfig.
class
CharSamplerLexiconEngine(vkit.engine.interface.Engine[vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, vkit.engine.char_sampler.type.CharSamplerEngineInitResource, vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, typing.Sequence[str]]):
class CharSamplerLexiconEngine(
    Engine[
        CharSamplerLexiconEngineInitConfig,
        CharSamplerLexiconEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Sample chars by first picking a lexicon tag, then a lexicon, then a char."""

    # Pseudo tag standing for "emit a space" in the with-space distribution.
    KEY_SPACE = '__space'

    @classmethod
    def get_type_name(cls) -> str:
        """Return the engine type name, ``'lexicon'``."""
        return 'lexicon'

    def __init__(
        self,
        init_config: CharSamplerLexiconEngineInitConfig,
        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
    ):
        """Build the tag distributions used by ``run``.

        Args:
            init_config: Tag weights and the space probability.
            init_resource: Must be provided; supplies ``lexicon_collection``.

        Raises:
            RuntimeError: If ``tag_to_weight`` is given but lacks a tag of the
                lexicon collection, or if ``prob_space`` is >= 1.0.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        tag_to_weight = init_config.tag_to_weight
        tag_weights = []
        for tag in self.lexicon_collection.tags:
            if tag_to_weight:
                # From config.
                if tag not in tag_to_weight:
                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
                weight = tag_to_weight[tag]
            else:
                # Based on the number of tagged lexicons.
                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
            tag_weights.append(weight)

        self.tags = self.lexicon_collection.tags
        self.tag_probs = normalize_to_probs(tag_weights)

        # Without a positive prob_space the two distributions are the same.
        self.with_space_tags = self.tags
        self.with_space_tag_probs = self.tag_probs

        prob_space = init_config.prob_space
        if prob_space > 0.0:
            if prob_space >= 1.0:
                # 1.0 would divide by zero below; > 1.0 would produce a
                # negative weight and hence invalid probabilities.
                raise RuntimeError(f'prob_space={prob_space} must be less than 1.0')
            self.with_space_tags = (*self.tags, self.KEY_SPACE)
            # The odds p / (1 - p) are appended to the tag probs so that,
            # assuming tag_probs sum to 1, the space ends up with probability
            # p after normalization.
            self.with_space_tag_probs = normalize_to_probs((
                *self.tag_probs,
                prob_space / (1 - prob_space),
            ))

    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
        """Sample a sequence of chars.

        Args:
            run_config: Provides ``num_chars`` and ``enable_aggregator_mode``.
            rng: Random generator driving every sampling decision.

        Returns:
            The sampled chars, one list item per position.
        """
        total = run_config.num_chars

        if run_config.enable_aggregator_mode:
            # In aggregator mode the length is itself random, in [1, num_chars].
            total = int(rng.integers(1, run_config.num_chars + 1))

        last_idx = total - 1
        sampled: List[str] = []
        for idx in range(total):
            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
            if tag == self.KEY_SPACE:
                # Disallow:
                # 1. leading or trailing space.
                # 2. consecutive spaces.
                if idx in (0, last_idx) or sampled[-1].isspace():
                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)

            if tag == self.KEY_SPACE:
                sampled.append(' ')
                continue

            lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
            sampled.append(rng_choice(rng, lexicon.char_and_aliases))

        return sampled
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.
This class can then be used as follows::
    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default
CharSamplerLexiconEngine( init_config: vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None)
def __init__(
    self,
    init_config: CharSamplerLexiconEngineInitConfig,
    init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
):
    """Build the tag distributions used by ``run``.

    Args:
        init_config: Tag weights and the space probability.
        init_resource: Must be provided; supplies ``lexicon_collection``.

    Raises:
        RuntimeError: If ``tag_to_weight`` is given but lacks a tag of the
            lexicon collection, or if ``prob_space`` is >= 1.0.
    """
    super().__init__(init_config, init_resource)

    assert init_resource
    self.lexicon_collection = init_resource.lexicon_collection

    tag_to_weight = init_config.tag_to_weight
    tag_weights = []
    for tag in self.lexicon_collection.tags:
        if tag_to_weight:
            # From config.
            if tag not in tag_to_weight:
                raise RuntimeError(f'missing tag={tag} in tag_to_weight')
            weight = tag_to_weight[tag]
        else:
            # Based on the number of tagged lexicons.
            weight = len(self.lexicon_collection.tag_to_lexicons[tag])
        tag_weights.append(weight)

    self.tags = self.lexicon_collection.tags
    self.tag_probs = normalize_to_probs(tag_weights)

    # Without a positive prob_space the two distributions are the same.
    self.with_space_tags = self.tags
    self.with_space_tag_probs = self.tag_probs

    prob_space = init_config.prob_space
    if prob_space > 0.0:
        if prob_space >= 1.0:
            # 1.0 would divide by zero below; > 1.0 would produce a
            # negative weight and hence invalid probabilities.
            raise RuntimeError(f'prob_space={prob_space} must be less than 1.0')
        self.with_space_tags = (*self.tags, self.KEY_SPACE)
        # The odds p / (1 - p) are appended to the tag probs so that,
        # assuming tag_probs sum to 1, the space ends up with probability
        # p after normalization.
        self.with_space_tag_probs = normalize_to_probs((
            *self.tag_probs,
            prob_space / (1 - prob_space),
        ))
def
run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: numpy.random._generator.Generator) -> Sequence[str]:
def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
    """Sample a sequence of chars.

    Args:
        run_config: Provides ``num_chars`` and ``enable_aggregator_mode``.
        rng: Random generator driving every sampling decision.

    Returns:
        The sampled chars, one list item per position.
    """
    total = run_config.num_chars

    if run_config.enable_aggregator_mode:
        # In aggregator mode the length is itself random, in [1, num_chars].
        total = int(rng.integers(1, run_config.num_chars + 1))

    last_idx = total - 1
    sampled: List[str] = []
    for idx in range(total):
        tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
        if tag == self.KEY_SPACE:
            # Disallow:
            # 1. leading or trailing space.
            # 2. consecutive spaces.
            if idx in (0, last_idx) or sampled[-1].isspace():
                tag = rng_choice(rng, self.tags, probs=self.tag_probs)

        if tag == self.KEY_SPACE:
            sampled.append(' ')
            continue

        lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
        sampled.append(rng_choice(rng, lexicon.char_and_aliases))

    return sampled