vkit.engine.char_sampler.faker
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence, List, Mapping, Optional
from collections import OrderedDict

import attrs
from numpy.random import Generator as RandomGenerator
from faker import Faker

from vkit.utility import rng_choice, normalize_to_probs
from ..interface import Engine, EngineExecutorFactory
from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig


@attrs.define
class CharSamplerFakerEngineInitConfig:
    """Relative weights that steer the faker-based char sampler."""

    # Faker locale name -> relative weight. The engine hands this mapping to
    # `Faker(...)` and iterates its keys when seeding.
    # A factory is used so that every config instance owns its own dict
    # instead of sharing one mutable default object.
    local_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'zh_CN': 4,
            'zh_TW': 1,
            'en_US': 5,
        }
    )
    # Faker method name -> relative weight. The engine normalizes the weights
    # into probabilities and samples one method name per generated text.
    method_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'address': 1,
            'ascii_email': 1,
            'dga': 1,
            'uri': 1,
            'word': 10,
            'name': 1,
            'country_calling_code': 1,
            'phone_number': 1,
        }
    )


CharSamplerFakerEngineInitResource = CharSamplerEngineInitResource


class CharSamplerFakerEngine(
    Engine[
        CharSamplerFakerEngineInitConfig,
        CharSamplerFakerEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler engine that draws its text from `Faker` methods."""

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, `'faker'`."""
        return 'faker'

    def __init__(
        self,
        init_config: CharSamplerFakerEngineInitConfig,
        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
    ):
        """Store the lexicon collection and precompute method probabilities.

        `init_resource` is required even though the signature allows `None`.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        # Sorting the method names fixes the order of the choices, so the
        # probabilities below line up with `self.methods` index by index.
        self.methods = sorted(init_config.method_to_weight)
        self.methods_probs = normalize_to_probs([
            init_config.method_to_weight[method] for method in self.methods
        ])

        self.faker: Optional[Faker] = None

    def sample_from_faker(self, rng: RandomGenerator):
        """Generate one non-empty text made only of chars known to the lexicon.

        A faker method is chosen by weight, faker is seeded from the current
        state of `rng`, and the produced text is split on whitespace. Chars
        rejected by `lexicon_collection.has_char` are dropped, empty segments
        are discarded and the rest is joined with single spaces. The whole
        procedure repeats until at least one segment survives.
        """
        # Faker is not picklable, hence need a lazy initialization.
        if self.faker is None:
            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))

        while True:
            method = rng_choice(rng, self.methods, probs=self.methods_probs)
            # Read after `rng_choice` has advanced `rng`, so each attempt
            # seeds faker with a different value.
            # NOTE(review): assumes the bit generator exposes
            # state['state']['state'] (e.g. PCG64) -- confirm.
            seed: int = rng.bit_generator.state['state']['state']
            for local in self.init_config.local_to_weight:
                self.faker[local].seed(seed)

            text = getattr(self.faker, method)()
            segments: List[str] = []
            for segment in text.split():
                segment = ''.join(
                    char for char in segment if self.lexicon_collection.has_char(char)
                )
                if segment:
                    segments.append(segment)
            if segments:
                return ' '.join(segments)

    def run(
        self,
        run_config: CharSamplerEngineRunConfig,
        rng: Optional[RandomGenerator] = None,
    ) -> Sequence[str]:
        """Sample chars from faker.

        In aggregator mode a single sampled text is returned as is. Otherwise
        texts are sampled and joined with spaces until `run_config.num_chars`
        chars are available; the result is cut to that length and never ends
        with a space. A non-positive `num_chars` yields an empty list.
        """
        assert rng is not None

        if run_config.enable_aggregator_mode:
            return self.sample_from_faker(rng)

        num_chars = run_config.num_chars
        if num_chars <= 0:
            # Nothing requested. Without this guard the trimming below would
            # index into an empty list.
            return []

        texts: List[str] = []
        num_chars_in_texts = 0
        # `num_chars_in_texts + len(texts) - 1` is the length of the joined
        # texts, counting one separating space between neighbours.
        while num_chars_in_texts + len(texts) - 1 < num_chars:
            text = self.sample_from_faker(rng)
            texts.append(text)
            num_chars_in_texts += len(text)

        chars = list(' '.join(texts))

        # Trim and make sure the last char is not space.
        if len(chars) > num_chars:
            rest = chars[num_chars:]
            chars = chars[:num_chars]
            if chars[-1].isspace():
                # Swap the trailing separator for the char that follows it.
                chars.pop()
                assert not rest[0].isspace()
                chars.append(rest[0])

        return chars


char_sampler_faker_engine_executor_factory = EngineExecutorFactory(CharSamplerFakerEngine)
class
CharSamplerFakerEngineInitConfig:
@attrs.define
class CharSamplerFakerEngineInitConfig:
    """Relative weights that steer the faker-based char sampler."""

    # Faker locale name -> relative weight. CharSamplerFakerEngine hands this
    # mapping to `Faker(...)` and iterates its keys when seeding.
    # A factory is used so that every config instance owns its own dict
    # instead of sharing one mutable default object.
    local_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'zh_CN': 4,
            'zh_TW': 1,
            'en_US': 5,
        }
    )
    # Faker method name -> relative weight. CharSamplerFakerEngine normalizes
    # the weights into probabilities and samples one method name per text.
    method_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'address': 1,
            'ascii_email': 1,
            'dga': 1,
            'uri': 1,
            'word': 10,
            'name': 1,
            'country_calling_code': 1,
            'phone_number': 1,
        }
    )
CharSamplerFakerEngineInitConfig( local_to_weight: Mapping[str, float] = {'zh_CN': 4, 'zh_TW': 1, 'en_US': 5}, method_to_weight: Mapping[str, float] = {'address': 1, 'ascii_email': 1, 'dga': 1, 'uri': 1, 'word': 10, 'name': 1, 'country_calling_code': 1, 'phone_number': 1})
def __init__(self, local_to_weight=attr_dict['local_to_weight'].default, method_to_weight=attr_dict['method_to_weight'].default):
    # Initializer generated by attrs: stores both weight mappings as passed,
    # falling back to the defaults recorded on the attribute definitions.
    self.local_to_weight = local_to_weight
    self.method_to_weight = method_to_weight
Method generated by attrs for class CharSamplerFakerEngineInitConfig.
class
CharSamplerFakerEngine(vkit.engine.interface.Engine[vkit.engine.char_sampler.faker.CharSamplerFakerEngineInitConfig, vkit.engine.char_sampler.type.CharSamplerEngineInitResource, vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, typing.Sequence[str]]):
class CharSamplerFakerEngine(
    Engine[
        CharSamplerFakerEngineInitConfig,
        CharSamplerFakerEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler engine that draws its text from `Faker` methods."""

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, `'faker'`."""
        return 'faker'

    def __init__(
        self,
        init_config: CharSamplerFakerEngineInitConfig,
        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
    ):
        """Store the lexicon collection and precompute method probabilities.

        `init_resource` is required even though the signature allows `None`.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        # Sorting the method names fixes the order of the choices, so the
        # probabilities below line up with `self.methods` index by index.
        self.methods = sorted(init_config.method_to_weight)
        self.methods_probs = normalize_to_probs([
            init_config.method_to_weight[method] for method in self.methods
        ])

        self.faker: Optional[Faker] = None

    def sample_from_faker(self, rng: RandomGenerator):
        """Generate one non-empty text made only of chars known to the lexicon.

        A faker method is chosen by weight, faker is seeded from the current
        state of `rng`, and the produced text is split on whitespace. Chars
        rejected by `lexicon_collection.has_char` are dropped, empty segments
        are discarded and the rest is joined with single spaces. The whole
        procedure repeats until at least one segment survives.
        """
        # Faker is not picklable, hence need a lazy initialization.
        if self.faker is None:
            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))

        while True:
            method = rng_choice(rng, self.methods, probs=self.methods_probs)
            # Read after `rng_choice` has advanced `rng`, so each attempt
            # seeds faker with a different value.
            # NOTE(review): assumes the bit generator exposes
            # state['state']['state'] (e.g. PCG64) -- confirm.
            seed: int = rng.bit_generator.state['state']['state']
            for local in self.init_config.local_to_weight:
                self.faker[local].seed(seed)

            text = getattr(self.faker, method)()
            segments: List[str] = []
            for segment in text.split():
                segment = ''.join(
                    char for char in segment if self.lexicon_collection.has_char(char)
                )
                if segment:
                    segments.append(segment)
            if segments:
                return ' '.join(segments)

    def run(
        self,
        run_config: CharSamplerEngineRunConfig,
        rng: Optional[RandomGenerator] = None,
    ) -> Sequence[str]:
        """Sample chars from faker.

        In aggregator mode a single sampled text is returned as is. Otherwise
        texts are sampled and joined with spaces until `run_config.num_chars`
        chars are available; the result is cut to that length and never ends
        with a space. A non-positive `num_chars` yields an empty list.
        """
        assert rng is not None

        if run_config.enable_aggregator_mode:
            return self.sample_from_faker(rng)

        num_chars = run_config.num_chars
        if num_chars <= 0:
            # Nothing requested. Without this guard the trimming below would
            # index into an empty list.
            return []

        texts: List[str] = []
        num_chars_in_texts = 0
        # `num_chars_in_texts + len(texts) - 1` is the length of the joined
        # texts, counting one separating space between neighbours.
        while num_chars_in_texts + len(texts) - 1 < num_chars:
            text = self.sample_from_faker(rng)
            texts.append(text)
            num_chars_in_texts += len(text)

        chars = list(' '.join(texts))

        # Trim and make sure the last char is not space.
        if len(chars) > num_chars:
            rest = chars[num_chars:]
            chars = chars[:num_chars]
            if chars[-1].isspace():
                # Swap the trailing separator for the char that follows it.
                chars.pop()
                assert not rest[0].isspace()
                chars.append(rest[0])

        return chars
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.
This class can then be used as follows::
    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default
CharSamplerFakerEngine( init_config: vkit.engine.char_sampler.faker.CharSamplerFakerEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None)
def __init__(
    self,
    init_config: CharSamplerFakerEngineInitConfig,
    init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
):
    """Set up the engine from its config and resource.

    Keeps the resource's lexicon collection, and turns the configured
    method weights into a sorted list of method names plus a matching list
    of probabilities. `init_resource` must be provided.
    """
    super().__init__(init_config, init_resource)

    assert init_resource
    self.lexicon_collection = init_resource.lexicon_collection

    # Sorted names give the choices a fixed order; the probabilities are
    # built in that same order.
    method_to_weight = init_config.method_to_weight
    method_names = sorted(method_to_weight)
    method_weights = [method_to_weight[name] for name in method_names]
    self.methods = method_names
    self.methods_probs = normalize_to_probs(method_weights)

    # Created on first use by `sample_from_faker`.
    self.faker: Optional[Faker] = None
def
sample_from_faker(self, rng: numpy.random._generator.Generator):
def sample_from_faker(self, rng: RandomGenerator):
    """Return one non-empty text produced by a randomly chosen faker method.

    Each attempt picks a method name by weight, seeds faker from the current
    state of `rng`, calls the method, and keeps only the chars accepted by
    `lexicon_collection.has_char`. Whitespace-separated segments that end up
    empty are dropped; the survivors are joined with single spaces. Attempts
    repeat until at least one segment survives.
    """
    # Faker is not picklable, so it is built on first use rather than in
    # `__init__`.
    faker = self.faker
    if faker is None:
        faker = Faker(OrderedDict(self.init_config.local_to_weight))
        self.faker = faker

    has_char = self.lexicon_collection.has_char
    locale_weights = self.init_config.local_to_weight

    while True:
        method_name = rng_choice(rng, self.methods, probs=self.methods_probs)

        # The state is read after `rng_choice` has consumed from `rng`, so a
        # retry seeds faker with a different value.
        # NOTE(review): assumes the bit generator exposes
        # state['state']['state'] (e.g. PCG64) -- confirm.
        faker_seed: int = rng.bit_generator.state['state']['state']
        for locale in locale_weights:
            faker[locale].seed(faker_seed)

        raw_text = getattr(faker, method_name)()
        filtered_words = (''.join(filter(has_char, word)) for word in raw_text.split())
        kept_words: List[str] = [word for word in filtered_words if word]
        if kept_words:
            return ' '.join(kept_words)
def
run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: Union[numpy.random._generator.Generator, NoneType] = None) -> Sequence[str]:
def run(
    self,
    run_config: CharSamplerEngineRunConfig,
    rng: Optional[RandomGenerator] = None,
) -> Sequence[str]:
    """Sample chars from faker.

    Args:
        run_config: Supplies `enable_aggregator_mode` and `num_chars`.
        rng: Random generator forwarded to `sample_from_faker`; required
            even though the signature allows `None`.

    Returns:
        In aggregator mode, the single text returned by
        `sample_from_faker`. Otherwise a list of single chars built from
        space-joined sampled texts, cut to `num_chars` entries and never
        ending with a space. A non-positive `num_chars` yields an empty
        list.
    """
    assert rng is not None

    if run_config.enable_aggregator_mode:
        return self.sample_from_faker(rng)

    num_chars = run_config.num_chars
    if num_chars <= 0:
        # Nothing requested. Without this guard the trimming below would
        # index into an empty list and raise IndexError.
        return []

    texts: List[str] = []
    num_chars_in_texts = 0
    # `num_chars_in_texts + len(texts) - 1` is the length of the joined
    # texts, counting one separating space between neighbours.
    while num_chars_in_texts + len(texts) - 1 < num_chars:
        text = self.sample_from_faker(rng)
        texts.append(text)
        num_chars_in_texts += len(text)

    chars = list(' '.join(texts))

    # Trim and make sure the last char is not space.
    if len(chars) > num_chars:
        rest = chars[num_chars:]
        chars = chars[:num_chars]
        if chars[-1].isspace():
            # Swap the trailing separator for the char that follows it.
            chars.pop()
            assert not rest[0].isspace()
            chars.append(rest[0])

    return chars