vkit.engine.char_sampler.faker
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence, List, Mapping, Optional
from collections import OrderedDict

import attrs
from numpy.random import Generator as RandomGenerator
from faker import Faker

from vkit.utility import rng_choice, normalize_to_probs
from vkit.engine.interface import Engine, EngineExecutorFactory
from .type import CharSamplerEngineInitResource, CharSamplerEngineRunConfig


@attrs.define
class CharSamplerFakerEngineInitConfig:
    """Init-time configuration of the faker-backed char sampler.

    Both defaults are produced by factories so that every config instance owns
    its own dict instead of sharing one module-level mutable object.
    """

    # Locale name -> weight. Handed to the ``Faker`` constructor as an ordered mapping.
    local_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'zh_CN': 4,
            'zh_TW': 1,
            'en_US': 5,
        }
    )
    # Faker method name -> weight. The engine samples one method per attempt.
    method_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'address': 1,
            'ascii_email': 1,
            'dga': 1,
            'uri': 1,
            'word': 10,
            'name': 1,
            'country_calling_code': 1,
            'phone_number': 1,
        }
    )


CharSamplerFakerEngineInitResource = CharSamplerEngineInitResource


class CharSamplerFakerEngine(
    Engine[
        CharSamplerFakerEngineInitConfig,
        CharSamplerFakerEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler engine that draws text from ``Faker`` provider methods."""

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, ``'faker'``."""
        return 'faker'

    def __init__(
        self,
        init_config: CharSamplerFakerEngineInitConfig,
        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
    ):
        """Store the lexicon collection and precompute the method distribution.

        ``init_resource`` is optional in the signature but required in practice:
        the assertion below fails when it is missing.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        # Sort the method names so the order (and the matching probs) is deterministic.
        method_to_weight = init_config.method_to_weight
        method_names = list(method_to_weight)
        method_names.sort()
        self.methods = method_names
        self.methods_probs = normalize_to_probs([
            method_to_weight[name] for name in method_names
        ])

        # Created lazily in sample_from_faker.
        self.faker: Optional[Faker] = None

    def sample_from_faker(self, rng: RandomGenerator):
        """Sample one non-empty text from a randomly chosen faker method.

        Characters rejected by ``lexicon_collection.has_char`` are dropped and
        the surviving whitespace-separated segments are joined by single spaces.
        Sampling repeats until at least one segment survives.
        """
        # Faker is not picklable, hence need a lazy initialization.
        if self.faker is None:
            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
        faker = self.faker
        has_char = self.lexicon_collection.has_char

        while True:
            method = rng_choice(rng, self.methods, probs=self.methods_probs)
            # NOTE(review): this nested 'state' layout presumably matches the bit generator
            # in use (e.g. PCG64) -- confirm if the generator type ever changes.
            seed: int = rng.bit_generator.state['state']['state']
            for local in self.init_config.local_to_weight:
                faker[local].seed(seed)

            text = getattr(faker, method)()
            filtered = (''.join(filter(has_char, word)) for word in text.split())
            segments: List[str] = [word for word in filtered if word]
            if segments:
                return ' '.join(segments)

    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
        """Sample characters according to ``run_config``.

        In aggregator mode a single sampled text is returned as is. Otherwise
        texts are sampled until their space-joined form holds at least
        ``run_config.num_chars`` characters, and the result is trimmed to exactly
        ``num_chars`` characters without a trailing space. A non-positive
        ``num_chars`` yields an empty list.
        """
        if run_config.enable_aggregator_mode:
            return self.sample_from_faker(rng)

        num_chars = run_config.num_chars
        if num_chars <= 0:
            # Nothing to sample; the trimming below would index an empty list.
            return []

        texts: List[str] = []
        num_chars_in_texts = 0
        # Joined length = total text length + (len(texts) - 1) separating spaces.
        while num_chars_in_texts + len(texts) - 1 < num_chars:
            text = self.sample_from_faker(rng)
            texts.append(text)
            num_chars_in_texts += len(text)

        chars = list(' '.join(texts))

        # Trim and make sure the last char is not space.
        if len(chars) > num_chars:
            rest = chars[num_chars:]
            chars = chars[:num_chars]
            if chars[-1].isspace():
                # Texts are joined by single spaces, so the next char cannot be a space.
                chars.pop()
                assert not rest[0].isspace()
                chars.append(rest[0])

        return chars


char_sampler_faker_engine_executor_factory = EngineExecutorFactory(CharSamplerFakerEngine)
class
CharSamplerFakerEngineInitConfig:
class CharSamplerFakerEngineInitConfig:
    """Init-time configuration of the faker-backed char sampler.

    Both defaults are produced by factories so that every config instance owns
    its own dict instead of sharing one module-level mutable object.
    """

    # Locale name -> weight. Handed to the ``Faker`` constructor as an ordered mapping.
    local_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'zh_CN': 4,
            'zh_TW': 1,
            'en_US': 5,
        }
    )
    # Faker method name -> weight. The engine samples one method per attempt.
    method_to_weight: Mapping[str, float] = attrs.field(
        factory=lambda: {
            'address': 1,
            'ascii_email': 1,
            'dga': 1,
            'uri': 1,
            'word': 10,
            'name': 1,
            'country_calling_code': 1,
            'phone_number': 1,
        }
    )
CharSamplerFakerEngineInitConfig( local_to_weight: Mapping[str, float] = {'zh_CN': 4, 'zh_TW': 1, 'en_US': 5}, method_to_weight: Mapping[str, float] = {'address': 1, 'ascii_email': 1, 'dga': 1, 'uri': 1, 'word': 10, 'name': 1, 'country_calling_code': 1, 'phone_number': 1})
# Initializer generated by attrs for CharSamplerFakerEngineInitConfig.
# Each parameter's default is read from the matching attribute definition in attr_dict.
def __init__(self, local_to_weight=attr_dict['local_to_weight'].default, method_to_weight=attr_dict['method_to_weight'].default):
    # Plain assignment; the arguments are stored as given.
    self.local_to_weight = local_to_weight
    self.method_to_weight = method_to_weight
Method generated by attrs for class CharSamplerFakerEngineInitConfig.
class
CharSamplerFakerEngine(vkit.engine.interface.Engine[vkit.engine.char_sampler.faker.CharSamplerFakerEngineInitConfig, vkit.engine.char_sampler.type.CharSamplerEngineInitResource, vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, typing.Sequence[str]]):
class CharSamplerFakerEngine(
    Engine[
        CharSamplerFakerEngineInitConfig,
        CharSamplerFakerEngineInitResource,
        CharSamplerEngineRunConfig,
        Sequence[str],
    ]
):  # yapf: disable
    """Char sampler engine that draws text from ``Faker`` provider methods."""

    @classmethod
    def get_type_name(cls) -> str:
        """Return the type name of this engine, ``'faker'``."""
        return 'faker'

    def __init__(
        self,
        init_config: CharSamplerFakerEngineInitConfig,
        init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
    ):
        """Store the lexicon collection and precompute the method distribution.

        ``init_resource`` is optional in the signature but required in practice:
        the assertion below fails when it is missing.
        """
        super().__init__(init_config, init_resource)

        assert init_resource
        self.lexicon_collection = init_resource.lexicon_collection

        # Sort the method names so the order (and the matching probs) is deterministic.
        method_to_weight = init_config.method_to_weight
        method_names = list(method_to_weight)
        method_names.sort()
        self.methods = method_names
        self.methods_probs = normalize_to_probs([
            method_to_weight[name] for name in method_names
        ])

        # Created lazily in sample_from_faker.
        self.faker: Optional[Faker] = None

    def sample_from_faker(self, rng: RandomGenerator):
        """Sample one non-empty text from a randomly chosen faker method.

        Characters rejected by ``lexicon_collection.has_char`` are dropped and
        the surviving whitespace-separated segments are joined by single spaces.
        Sampling repeats until at least one segment survives.
        """
        # Faker is not picklable, hence need a lazy initialization.
        if self.faker is None:
            self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
        faker = self.faker
        has_char = self.lexicon_collection.has_char

        while True:
            method = rng_choice(rng, self.methods, probs=self.methods_probs)
            # NOTE(review): this nested 'state' layout presumably matches the bit generator
            # in use (e.g. PCG64) -- confirm if the generator type ever changes.
            seed: int = rng.bit_generator.state['state']['state']
            for local in self.init_config.local_to_weight:
                faker[local].seed(seed)

            text = getattr(faker, method)()
            filtered = (''.join(filter(has_char, word)) for word in text.split())
            segments: List[str] = [word for word in filtered if word]
            if segments:
                return ' '.join(segments)

    def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
        """Sample characters according to ``run_config``.

        In aggregator mode a single sampled text is returned as is. Otherwise
        texts are sampled until their space-joined form holds at least
        ``run_config.num_chars`` characters, and the result is trimmed to exactly
        ``num_chars`` characters without a trailing space. A non-positive
        ``num_chars`` yields an empty list.
        """
        if run_config.enable_aggregator_mode:
            return self.sample_from_faker(rng)

        num_chars = run_config.num_chars
        if num_chars <= 0:
            # Nothing to sample; the trimming below would index an empty list.
            return []

        texts: List[str] = []
        num_chars_in_texts = 0
        # Joined length = total text length + (len(texts) - 1) separating spaces.
        while num_chars_in_texts + len(texts) - 1 < num_chars:
            text = self.sample_from_faker(rng)
            texts.append(text)
            num_chars_in_texts += len(text)

        chars = list(' '.join(texts))

        # Trim and make sure the last char is not space.
        if len(chars) > num_chars:
            rest = chars[num_chars:]
            chars = chars[:num_chars]
            if chars[-1].isspace():
                # Texts are joined by single spaces, so the next char cannot be a space.
                chars.pop()
                assert not rest[0].isspace()
                chars.append(rest[0])

        return chars
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
class Mapping(Generic[KT, VT]): def __getitem__(self, key: KT) -> VT: ... # Etc.
This class can then be used as follows::
def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT: try: return mapping[key] except KeyError: return default
CharSamplerFakerEngine( init_config: vkit.engine.char_sampler.faker.CharSamplerFakerEngineInitConfig, init_resource: Union[vkit.engine.char_sampler.type.CharSamplerEngineInitResource, NoneType] = None)
def __init__(
    self,
    init_config: CharSamplerFakerEngineInitConfig,
    init_resource: Optional[CharSamplerFakerEngineInitResource] = None,
):
    """Store the lexicon collection and precompute the method distribution.

    ``init_resource`` is optional in the signature but required in practice:
    the assertion below fails when it is missing.
    """
    super().__init__(init_config, init_resource)

    assert init_resource
    self.lexicon_collection = init_resource.lexicon_collection

    # Sort the method names so the order (and the matching probs) is deterministic.
    method_to_weight = init_config.method_to_weight
    method_names = list(method_to_weight)
    method_names.sort()
    self.methods = method_names
    self.methods_probs = normalize_to_probs([
        method_to_weight[name] for name in method_names
    ])

    # Created lazily in sample_from_faker.
    self.faker: Optional[Faker] = None
def
sample_from_faker(self, rng: numpy.random._generator.Generator):
def sample_from_faker(self, rng: RandomGenerator):
    """Sample one non-empty text from a randomly chosen faker method.

    Characters rejected by ``lexicon_collection.has_char`` are dropped and
    the surviving whitespace-separated segments are joined by single spaces.
    Sampling repeats until at least one segment survives.
    """
    # Faker is not picklable, hence need a lazy initialization.
    if self.faker is None:
        self.faker = Faker(OrderedDict(self.init_config.local_to_weight))
    faker = self.faker
    has_char = self.lexicon_collection.has_char

    while True:
        # Pick the method first, then read the generator state to seed faker.
        method = rng_choice(rng, self.methods, probs=self.methods_probs)
        # NOTE(review): this nested 'state' layout presumably matches the bit generator
        # in use (e.g. PCG64) -- confirm if the generator type ever changes.
        seed: int = rng.bit_generator.state['state']['state']
        for local in self.init_config.local_to_weight:
            faker[local].seed(seed)

        text = getattr(faker, method)()
        filtered = (''.join(filter(has_char, word)) for word in text.split())
        segments: List[str] = [word for word in filtered if word]
        if segments:
            return ' '.join(segments)
def
run( self, run_config: vkit.engine.char_sampler.type.CharSamplerEngineRunConfig, rng: numpy.random._generator.Generator) -> Sequence[str]:
def run(self, run_config: CharSamplerEngineRunConfig, rng: RandomGenerator) -> Sequence[str]:
    """Sample characters according to ``run_config``.

    In aggregator mode a single sampled text is returned as is. Otherwise
    texts are sampled until their space-joined form holds at least
    ``run_config.num_chars`` characters, and the result is trimmed to exactly
    ``num_chars`` characters without a trailing space. A non-positive
    ``num_chars`` yields an empty list.
    """
    if run_config.enable_aggregator_mode:
        return self.sample_from_faker(rng)

    num_chars = run_config.num_chars
    if num_chars <= 0:
        # Nothing to sample; the trimming below would index an empty list.
        return []

    texts: List[str] = []
    num_chars_in_texts = 0
    # Joined length = total text length + (len(texts) - 1) separating spaces.
    while num_chars_in_texts + len(texts) - 1 < num_chars:
        text = self.sample_from_faker(rng)
        texts.append(text)
        num_chars_in_texts += len(text)

    chars = list(' '.join(texts))

    # Trim and make sure the last char is not space.
    if len(chars) > num_chars:
        rest = chars[num_chars:]
        chars = chars[:num_chars]
        if chars[-1].isspace():
            # Texts are joined by single spaces, so the next char cannot be a space.
            chars.pop()
            assert not rest[0].isspace()
            chars.append(rest[0])

    return chars