vkit.engine.char_sampler.lexicon

View Source

  1# Copyright 2022 vkit-x Administrator. All Rights Reserved.
  2#
  3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
  4#
  5# The commercial license gives you the full rights to create and distribute software
  6# on your own terms without any SSPL license obligations. For more information,
  7# please see the "LICENSE_COMMERCIAL.txt" file.
  8#
  9# This project is also available under Server Side Public License (SSPL).
 10# The SSPL licensing is ideal for use cases such as open source projects with
 11# SSPL distribution, student/academic purposes, hobby projects, internal research
 12# projects without external distribution, or other projects where all SSPL
 13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
 14from typing import Sequence, Mapping, Optional, List
 15
 16import attrs
 17from numpy.random import Generator as RandomGenerator
 18
 19from vkit.utility import rng_choice, normalize_to_probs
 20from ..interface import Engine, EngineExecutorFactory
 21from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig
 22
 23
 24@attrs.define
 25class CharSamplerLexiconEngineInitConfig:
 26    tag_to_weight: Optional[Mapping[str, float]] = None
 27    prob_space: float = 0.0
 28
 29
 30CharSamplerLexiconEngineInitResource = CharSamplerEngineInitResource  # Alias: this engine uses the shared char sampler init resource type unchanged.
 31
 32
 33class CharSamplerLexiconEngine(
 34    Engine[
 35        CharSamplerLexiconEngineInitConfig,
 36        CharSamplerLexiconEngineInitResource,
 37        CharSamplerEngineRunConfig,
 38        Sequence[str],
 39    ]
 40):  # yapf: disable
 41
 42    KEY_SPACE = '__space'
 43
 44    @classmethod
 45    def get_type_name(cls) -> str:
 46        return 'lexicon'
 47
 48    def __init__(
 49        self,
 50        init_config: CharSamplerLexiconEngineInitConfig,
 51        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
 52    ):
 53        super().__init__(init_config, init_resource)
 54
 55        assert init_resource
 56        self.lexicon_collection = init_resource.lexicon_collection
 57
 58        tag_weights = []
 59        for tag in self.lexicon_collection.tags:
 60            if init_config.tag_to_weight:
 61                # From config.
 62                if tag not in init_config.tag_to_weight:
 63                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
 64                weight = init_config.tag_to_weight[tag]
 65            else:
 66                # Based on the number of tagged lexicons.
 67                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
 68            tag_weights.append(weight)
 69
 70        self.tags = self.lexicon_collection.tags
 71        self.tag_probs = normalize_to_probs(tag_weights)
 72
 73        self.with_space_tags = self.tags
 74        self.with_space_tag_probs = self.tag_probs
 75        if init_config.prob_space > 0.0:
 76            self.with_space_tags = (*self.tags, self.KEY_SPACE)
 77            self.with_space_tag_probs = normalize_to_probs((
 78                *self.tag_probs,
 79                init_config.prob_space / (1 - init_config.prob_space),
 80            ))
 81
 82    def run(
 83        self,
 84        run_config: CharSamplerEngineRunConfig,
 85        rng: Optional[RandomGenerator] = None,
 86    ) -> Sequence[str]:
 87        assert rng is not None
 88
 89        num_chars = run_config.num_chars
 90
 91        if run_config.enable_aggregator_mode:
 92            num_chars = int(rng.integers(1, run_config.num_chars + 1))
 93
 94        chars: List[str] = []
 95        for char_idx in range(num_chars):
 96            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
 97            if tag == self.KEY_SPACE:
 98                if char_idx == 0 \
 99                        or char_idx == num_chars - 1 \
100                        or chars[char_idx - 1].isspace():
101                    # Disallow:
102                    # 1. leading or trailing space.
103                    # 2. consecutive spaces.
104                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)
105
106            if tag == self.KEY_SPACE:
107                chars.append(' ')
108            else:
109                lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
110                char = rng_choice(rng, lexicon.char_and_aliases)
111                chars.append(char)
112
113        return chars
114
115
# Module-level executor factory built around the lexicon char sampler engine.
char_sampler_lexicon_engine_executor_factory = EngineExecutorFactory(
    CharSamplerLexiconEngine,
)

class CharSamplerLexiconEngineInitConfig: View Source

class CharSamplerLexiconEngineInitConfig:
    """Initialization options of the lexicon-based character sampler."""

    # Optional per-tag sampling weights; None by default.
    tag_to_weight: Optional[Mapping[str, float]] = None

    # Controls how often a space is sampled; 0.0 by default.
    prob_space: float = 0.0

CharSamplerLexiconEngineInitConfig( tag_to_weight: Union[Mapping[str, float], NoneType] = None, prob_space: float = 0.0) View Source

def __init__(
    self,
    tag_to_weight=attr_dict['tag_to_weight'].default,
    prob_space=attr_dict['prob_space'].default,
):
    """Store both configuration fields on the instance as given."""
    self.tag_to_weight = tag_to_weight
    self.prob_space = prob_space

Method generated by attrs for class CharSamplerLexiconEngineInitConfig.

class CharSamplerLexiconEngine(vkit.engine.interface.Engine[vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, vkit.engine.char_sampler.type.CharSamplerEngineInitResource, vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, typing.Sequence[str]]): View Source

 34class CharSamplerLexiconEngine(
 35    Engine[
 36        CharSamplerLexiconEngineInitConfig,
 37        CharSamplerLexiconEngineInitResource,
 38        CharSamplerEngineRunConfig,
 39        Sequence[str],
 40    ]
 41):  # yapf: disable
 42
 43    KEY_SPACE = '__space'
 44
 45    @classmethod
 46    def get_type_name(cls) -> str:
 47        return 'lexicon'
 48
 49    def __init__(
 50        self,
 51        init_config: CharSamplerLexiconEngineInitConfig,
 52        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
 53    ):
 54        super().__init__(init_config, init_resource)
 55
 56        assert init_resource
 57        self.lexicon_collection = init_resource.lexicon_collection
 58
 59        tag_weights = []
 60        for tag in self.lexicon_collection.tags:
 61            if init_config.tag_to_weight:
 62                # From config.
 63                if tag not in init_config.tag_to_weight:
 64                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
 65                weight = init_config.tag_to_weight[tag]
 66            else:
 67                # Based on the number of tagged lexicons.
 68                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
 69            tag_weights.append(weight)
 70
 71        self.tags = self.lexicon_collection.tags
 72        self.tag_probs = normalize_to_probs(tag_weights)
 73
 74        self.with_space_tags = self.tags
 75        self.with_space_tag_probs = self.tag_probs
 76        if init_config.prob_space > 0.0:
 77            self.with_space_tags = (*self.tags, self.KEY_SPACE)
 78            self.with_space_tag_probs = normalize_to_probs((
 79                *self.tag_probs,
 80                init_config.prob_space / (1 - init_config.prob_space),
 81            ))
 82
 83    def run(
 84        self,
 85        run_config: CharSamplerEngineRunConfig,
 86        rng: Optional[RandomGenerator] = None,
 87    ) -> Sequence[str]:
 88        assert rng is not None
 89
 90        num_chars = run_config.num_chars
 91
 92        if run_config.enable_aggregator_mode:
 93            num_chars = int(rng.integers(1, run_config.num_chars + 1))
 94
 95        chars: List[str] = []
 96        for char_idx in range(num_chars):
 97            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
 98            if tag == self.KEY_SPACE:
 99                if char_idx == 0 \
100                        or char_idx == num_chars - 1 \
101                        or chars[char_idx - 1].isspace():
102                    # Disallow:
103                    # 1. leading or trailing space.
104                    # 2. consecutive spaces.
105                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)
106
107            if tag == self.KEY_SPACE:
108                chars.append(' ')
109            else:
110                lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
111                char = rng_choice(rng, lexicon.char_and_aliases)
112                chars.append(char)
113
114        return chars

Abstract base class for generic types.

A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::

    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.

This class can then be used as follows::

    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default

CharSamplerLexiconEngine( init_config: vkit.engine.char_sampler.lexicon.CharSamplerLexiconEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None) View Source

49    def __init__(
50        self,
51        init_config: CharSamplerLexiconEngineInitConfig,
52        init_resource: Optional[CharSamplerLexiconEngineInitResource] = None,
53    ):
54        super().__init__(init_config, init_resource)
55
56        assert init_resource
57        self.lexicon_collection = init_resource.lexicon_collection
58
59        tag_weights = []
60        for tag in self.lexicon_collection.tags:
61            if init_config.tag_to_weight:
62                # From config.
63                if tag not in init_config.tag_to_weight:
64                    raise RuntimeError(f'missing tag={tag} in tag_to_weight')
65                weight = init_config.tag_to_weight[tag]
66            else:
67                # Based on the number of tagged lexicons.
68                weight = len(self.lexicon_collection.tag_to_lexicons[tag])
69            tag_weights.append(weight)
70
71        self.tags = self.lexicon_collection.tags
72        self.tag_probs = normalize_to_probs(tag_weights)
73
74        self.with_space_tags = self.tags
75        self.with_space_tag_probs = self.tag_probs
76        if init_config.prob_space > 0.0:
77            self.with_space_tags = (*self.tags, self.KEY_SPACE)
78            self.with_space_tag_probs = normalize_to_probs((
79                *self.tag_probs,
80                init_config.prob_space / (1 - init_config.prob_space),
81            ))

@classmethod

def get_type_name(cls) -> str: View Source

45    @classmethod
46    def get_type_name(cls) -> str:
47        return 'lexicon'

def run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: Union[numpy.random._generator.Generator, NoneType] = None) -> Sequence[str]: View Source

 83    def run(
 84        self,
 85        run_config: CharSamplerEngineRunConfig,
 86        rng: Optional[RandomGenerator] = None,
 87    ) -> Sequence[str]:
 88        assert rng is not None
 89
 90        num_chars = run_config.num_chars
 91
 92        if run_config.enable_aggregator_mode:
 93            num_chars = int(rng.integers(1, run_config.num_chars + 1))
 94
 95        chars: List[str] = []
 96        for char_idx in range(num_chars):
 97            tag = rng_choice(rng, self.with_space_tags, probs=self.with_space_tag_probs)
 98            if tag == self.KEY_SPACE:
 99                if char_idx == 0 \
100                        or char_idx == num_chars - 1 \
101                        or chars[char_idx - 1].isspace():
102                    # Disallow:
103                    # 1. leading or trailing space.
104                    # 2. consecutive spaces.
105                    tag = rng_choice(rng, self.tags, probs=self.tag_probs)
106
107            if tag == self.KEY_SPACE:
108                chars.append(' ')
109            else:
110                lexicon = rng_choice(rng, self.lexicon_collection.tag_to_lexicons[tag])
111                char = rng_choice(rng, lexicon.char_and_aliases)
112                chars.append(char)
113
114        return chars