vkit.engine.char_sampler.faker

  1# Copyright 2022 vkit-x Administrator. All Rights Reserved.
  2#
  3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
  4#
  5# The commercial license gives you the full rights to create and distribute software
  6# on your own terms without any SSPL license obligations. For more information,
  7# please see the "LICENSE_COMMERCIAL.txt" file.
  8#
  9# This project is also available under Server Side Public License (SSPL).
 10# The SSPL licensing is ideal for use cases such as open source projects with
 11# SSPL distribution, student/academic purposes, hobby projects, internal research
 12# projects without external distribution, or other projects where all SSPL
 13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
 14from typing import Sequence, List, Mapping, Optional
 15from collections import OrderedDict
 16
 17import attrs
 18from numpy.random import Generator as RandomGenerator
 19from faker import Faker
 20
 21from vkit.utility import rng_choice, normalize_to_probs
 22from vkit.engine.interface import Engine, EngineExecutorFactory
 23from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig
 24
 25
 26@attrs.define
 27class CharSamplerFakerEngineInitConfig:
 28    local_to_weight: Mapping[str, float] = {
 29        'zh_CN': 4,
 30        'zh_TW': 1,
 31        'en_US': 5,
 32    }
 33    method_to_weight: Mapping[str, float] = {
 34        'address': 1,
 35        'ascii_email': 1,
 36        'dga': 1,
 37        'uri': 1,
 38        'word': 10,
 39        'name': 1,
 40        'country_calling_code': 1,
 41        'phone_number': 1,
 42    }
 43
 44
# Alias: the faker engine uses the shared char sampler init resource type as-is.
CharSamplerFakerEngineInitResource = CharSamplerEngineInitResource
 46
 47
 48class CharSamplerFakerEngine(
 49    Engine[
 50        CharSamplerFakerEngineInitConfig,
 51        CharSamplerFakerEngineInitResource,
 52        CharSamplerEngineRunConfig,
 53        Sequence[str],
 54    ]
 55):  # yapf: disable
 56
 57    @classmethod
 58    def get_type_name(cls) -> str:
 59        return 'faker'
 60
 61    def __init__(
 62        self,
 63        init_config: CharSamplerFakerEngineInitConfig,
 64        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
 65    ):
 66        super().__init__(init_config, init_resource)
 67
 68        assert init_resource
 69        self.lexicon_collection = init_resource.lexicon_collection
 70
 71        self.methods = sorted(init_config.method_to_weight)
 72        self.methods_probs = normalize_to_probs([
 73            init_config.method_to_weight[method] for method in self.methods
 74        ])
 75
 76        self.faker: Optional[Faker] = None
 77
 78    def sample_from_faker(self, rng: RandomGenerator):
 79        # Faker is not picklable, hence need a lazy initialization.
 80        if self.faker is None:
 81            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
 82
 83        while True:
 84            method = rng_choice(rng, self.methods, probs=self.methods_probs)
 85            seed: int = rng.bit_generator.state['state']['state']
 86            for local in self.init_config.local_to_weight:
 87                self.faker[local].seed(seed)
 88
 89            text = getattr(self.faker, method)()
 90            segments: List[str] = []
 91            for segment in text.split():
 92                segment = ''.join(
 93                    char for char in segment if self.lexicon_collection.has_char(char)
 94                )
 95                if segment:
 96                    segments.append(segment)
 97            if segments:
 98                return ' '.join(segments)
 99
100    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
101        if not run_config.enable_aggregator_mode:
102            num_chars = run_config.num_chars
103
104            texts: List[str] = []
105            num_chars_in_texts = 0
106            while num_chars_in_texts + len(texts) - 1 < num_chars:
107                text = self.sample_from_faker(rng)
108                texts.append(text)
109                num_chars_in_texts += len(text)
110
111            chars = list(' '.join(texts))
112
113            # Trim and make sure the last char is not space.
114            if len(chars) > num_chars:
115                rest = chars[num_chars:]
116                chars = chars[:num_chars]
117                if chars[-1].isspace():
118                    chars.pop()
119                    assert not rest[0].isspace()
120                    chars.append(rest[0])
121
122            return chars
123
124        else:
125            return self.sample_from_faker(rng)
126
127
# Module-level EngineExecutorFactory wrapping CharSamplerFakerEngine.
char_sampler_faker_engine_executor_factory = EngineExecutorFactory(CharSamplerFakerEngine)
class CharSamplerFakerEngineInitConfig:
28class CharSamplerFakerEngineInitConfig:
29    local_to_weight: Mapping[str, float] = {
30        'zh_CN': 4,
31        'zh_TW': 1,
32        'en_US': 5,
33    }
34    method_to_weight: Mapping[str, float] = {
35        'address': 1,
36        'ascii_email': 1,
37        'dga': 1,
38        'uri': 1,
39        'word': 10,
40        'name': 1,
41        'country_calling_code': 1,
42        'phone_number': 1,
43    }
CharSamplerFakerEngineInitConfig( local_to_weight: Mapping[str, float] = {'zh_CN': 4, 'zh_TW': 1, 'en_US': 5}, method_to_weight: Mapping[str, float] = {'address': 1, 'ascii_email': 1, 'dga': 1, 'uri': 1, 'word': 10, 'name': 1, 'country_calling_code': 1, 'phone_number': 1})
2def __init__(self, local_to_weight=attr_dict['local_to_weight'].default, method_to_weight=attr_dict['method_to_weight'].default):
3    self.local_to_weight = local_to_weight
4    self.method_to_weight = method_to_weight

Method generated by attrs for class CharSamplerFakerEngineInitConfig.

 49class CharSamplerFakerEngine(
 50    Engine[
 51        CharSamplerFakerEngineInitConfig,
 52        CharSamplerFakerEngineInitResource,
 53        CharSamplerEngineRunConfig,
 54        Sequence[str],
 55    ]
 56):  # yapf: disable
 57
 58    @classmethod
 59    def get_type_name(cls) -> str:
 60        return 'faker'
 61
 62    def __init__(
 63        self,
 64        init_config: CharSamplerFakerEngineInitConfig,
 65        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
 66    ):
 67        super().__init__(init_config, init_resource)
 68
 69        assert init_resource
 70        self.lexicon_collection = init_resource.lexicon_collection
 71
 72        self.methods = sorted(init_config.method_to_weight)
 73        self.methods_probs = normalize_to_probs([
 74            init_config.method_to_weight[method] for method in self.methods
 75        ])
 76
 77        self.faker: Optional[Faker] = None
 78
 79    def sample_from_faker(self, rng: RandomGenerator):
 80        # Faker is not picklable, hence need a lazy initialization.
 81        if self.faker is None:
 82            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
 83
 84        while True:
 85            method = rng_choice(rng, self.methods, probs=self.methods_probs)
 86            seed: int = rng.bit_generator.state['state']['state']
 87            for local in self.init_config.local_to_weight:
 88                self.faker[local].seed(seed)
 89
 90            text = getattr(self.faker, method)()
 91            segments: List[str] = []
 92            for segment in text.split():
 93                segment = ''.join(
 94                    char for char in segment if self.lexicon_collection.has_char(char)
 95                )
 96                if segment:
 97                    segments.append(segment)
 98            if segments:
 99                return ' '.join(segments)
100
101    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
102        if not run_config.enable_aggregator_mode:
103            num_chars = run_config.num_chars
104
105            texts: List[str] = []
106            num_chars_in_texts = 0
107            while num_chars_in_texts + len(texts) - 1 < num_chars:
108                text = self.sample_from_faker(rng)
109                texts.append(text)
110                num_chars_in_texts += len(text)
111
112            chars = list(' '.join(texts))
113
114            # Trim and make sure the last char is not space.
115            if len(chars) > num_chars:
116                rest = chars[num_chars:]
117                chars = chars[:num_chars]
118                if chars[-1].isspace():
119                    chars.pop()
120                    assert not rest[0].isspace()
121                    chars.append(rest[0])
122
123            return chars
124
125        else:
126            return self.sample_from_faker(rng)

Abstract base class for generic types.

A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::

    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.

This class can then be used as follows::

    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default

CharSamplerFakerEngine( init_config: vkit.engine.char_sampler.faker.CharSamplerFakerEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None)
62    def __init__(
63        self,
64        init_config: CharSamplerFakerEngineInitConfig,
65        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
66    ):
67        super().__init__(init_config, init_resource)
68
69        assert init_resource
70        self.lexicon_collection = init_resource.lexicon_collection
71
72        self.methods = sorted(init_config.method_to_weight)
73        self.methods_probs = normalize_to_probs([
74            init_config.method_to_weight[method] for method in self.methods
75        ])
76
77        self.faker: Optional[Faker] = None
@classmethod
def get_type_name(cls) -> str:
58    @classmethod
59    def get_type_name(cls) -> str:
60        return 'faker'
def sample_from_faker(self, rng: numpy.random._generator.Generator):
79    def sample_from_faker(self, rng: RandomGenerator):
80        # Faker is not picklable, hence need a lazy initialization.
81        if self.faker is None:
82            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
83
84        while True:
85            method = rng_choice(rng, self.methods, probs=self.methods_probs)
86            seed: int = rng.bit_generator.state['state']['state']
87            for local in self.init_config.local_to_weight:
88                self.faker[local].seed(seed)
89
90            text = getattr(self.faker, method)()
91            segments: List[str] = []
92            for segment in text.split():
93                segment = ''.join(
94                    char for char in segment if self.lexicon_collection.has_char(char)
95                )
96                if segment:
97                    segments.append(segment)
98            if segments:
99                return ' '.join(segments)
def run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: numpy.random._generator.Generator) -> Sequence[str]:
101    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
102        if not run_config.enable_aggregator_mode:
103            num_chars = run_config.num_chars
104
105            texts: List[str] = []
106            num_chars_in_texts = 0
107            while num_chars_in_texts + len(texts) - 1 < num_chars:
108                text = self.sample_from_faker(rng)
109                texts.append(text)
110                num_chars_in_texts += len(text)
111
112            chars = list(' '.join(texts))
113
114            # Trim and make sure the last char is not space.
115            if len(chars) > num_chars:
116                rest = chars[num_chars:]
117                chars = chars[:num_chars]
118                if chars[-1].isspace():
119                    chars.pop()
120                    assert not rest[0].isspace()
121                    chars.append(rest[0])
122
123            return chars
124
125        else:
126            return self.sample_from_faker(rng)