vkit.pipeline.text_detection.page_resizing
# Copyright 2022 vkit-x Administrator. All Rights Reserved.
#
# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses.
#
# The commercial license gives you the full rights to create and distribute software
# on your own terms without any SSPL license obligations. For more information,
# please see the "LICENSE_COMMERCIAL.txt" file.
#
# This project is also available under Server Side Public License (SSPL).
# The SSPL licensing is ideal for use cases such as open source projects with
# SSPL distribution, student/academic purposes, hobby projects, internal research
# projects without external distribution, or other projects where all SSPL
# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file.
from typing import Sequence
import logging

import attrs
from numpy.random import Generator as RandomGenerator
import numpy as np

from vkit.utility import sample_cv_resize_interpolation
from vkit.element import Mask, ScoreMap, Image
from .page_distortion import PageDistortionStepOutput
from ..interface import PipelineStep, PipelineStepFactory

logger = logging.getLogger(__name__)


@attrs.define
class PageResizingStepConfig:
    """Configuration of the page resizing step."""

    # Lower / upper bound handed to `rng.uniform` when sampling the text line height
    # that the page is rescaled to.
    resized_text_line_height_min: float = 3.0
    resized_text_line_height_max: float = 10.0
    # Text line heights that are not greater than this value are ignored when the
    # reference (minimum) text line height of the page is computed.
    text_line_heights_filtering_thr: float = 1.0


@attrs.define
class PageResizingStepInput:
    """Input of the page resizing step: the output of the page distortion step."""

    page_distortion_step_output: PageDistortionStepOutput


@attrs.define
class PageResizingStepOutput:
    """Output of the page resizing step.

    Every field is the resized counterpart of the same-named field of the page
    distortion step output. The two score maps are additionally multiplied by the
    resize ratio.
    """

    page_image: Image
    page_active_mask: Mask
    page_char_mask: Mask
    page_char_height_score_map: ScoreMap
    page_text_line_mask: Mask
    page_text_line_height_score_map: ScoreMap


class PageResizingStep(
    PipelineStep[
        PageResizingStepConfig,
        PageResizingStepInput,
        PageResizingStepOutput,
    ]
):  # yapf: disable
    """Rescale a distorted page so that its text lines reach a randomly sampled height.

    The page image, its masks and its height score maps are all resized to the same
    target shape.
    """

    def __init__(self, config: PageResizingStepConfig):
        super().__init__(config)

    def get_text_line_heights_min(
        self,
        page_distorted_text_line_heights: Sequence[float],
    ) -> float:
        """Return the smallest text line height after filtering and outlier removal.

        Heights that are not greater than ``config.text_line_heights_filtering_thr``
        are dropped first. Each remaining height is then scored by its absolute
        deviation from the median, divided by the median of those deviations (or by
        1.0 when that median is zero). Heights scoring 3.5 or more are discarded.

        Args:
            page_distorted_text_line_heights: Text line heights of the distorted page.

        Returns:
            The minimum of the heights that survive both stages, as a Python float.

        Raises:
            AssertionError: If no height passes the filtering threshold.
        """
        # 1. Filtering.
        filtering_thr = self.config.text_line_heights_filtering_thr
        text_line_heights = [
            text_line_height for text_line_height in page_distorted_text_line_heights
            if text_line_height > filtering_thr
        ]
        assert text_line_heights, (
            f'No text line height is greater than the filtering threshold {filtering_thr}.'
        )

        # 2. Remove outliers.
        # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
        heights = np.asarray(text_line_heights)
        deltas = np.abs(heights - np.median(heights))
        # A zero median deviation would divide by zero, hence the fallback to 1.0.
        deltas_median = np.median(deltas)
        delta_ratios = deltas / (deltas_median or 1.0)
        # At least half of the deviations are <= their median, so the mask below
        # always selects at least one height.
        return float(heights[delta_ratios < 3.5].min())

    def run(
        self,
        input: PageResizingStepInput,
        rng: RandomGenerator,
    ) -> PageResizingStepOutput:
        """Resize the distorted page, its masks and its height score maps.

        A target text line height is sampled uniformly between
        ``config.resized_text_line_height_min`` and
        ``config.resized_text_line_height_max``. The resize ratio is that target
        divided by the value returned by ``get_text_line_heights_min``.

        Args:
            input: Wraps the output of the page distortion step.
            rng: Random generator used to sample the target height and the
                interpolation method.

        Returns:
            The resized image, masks and score maps. Score map values are multiplied
            by the resize ratio.

        Raises:
            AssertionError: If a required distortion output is missing (falsy), or if
                a mask / score map shape differs from the page image shape.
        """
        page_distortion_step_output = input.page_distortion_step_output
        page_image = page_distortion_step_output.page_image
        page_active_mask = page_distortion_step_output.page_active_mask

        # The asserts below require each of these distortion outputs to be truthy.
        page_char_mask = page_distortion_step_output.page_char_mask
        assert page_char_mask

        page_char_height_score_map = page_distortion_step_output.page_char_height_score_map
        assert page_char_height_score_map

        page_text_line_mask = page_distortion_step_output.page_text_line_mask
        assert page_text_line_mask

        page_text_line_height_score_map = \
            page_distortion_step_output.page_text_line_height_score_map
        assert page_text_line_height_score_map

        page_distorted_text_line_heights = page_distortion_step_output.page_text_line_heights
        assert page_distorted_text_line_heights

        # Resizing.
        height, width = page_image.shape
        text_line_heights_min = self.get_text_line_heights_min(page_distorted_text_line_heights)
        logger.debug('text_line_heights_min=%s', text_line_heights_min)
        resized_text_line_height = rng.uniform(
            self.config.resized_text_line_height_min,
            self.config.resized_text_line_height_max,
        )
        resize_ratio = resized_text_line_height / text_line_heights_min

        # round() can yield 0 for a very small ratio or a very thin page. A zero-sized
        # target is presumably rejected by the underlying resize, so keep >= 1 pixel.
        resized_height = max(1, round(resize_ratio * height))
        resized_width = max(1, round(resize_ratio * width))

        # INTER_AREA is only offered as a candidate when the page is shrunk.
        cv_resize_interpolation = sample_cv_resize_interpolation(
            rng,
            include_cv_inter_area=(resize_ratio < 1.0),
        )
        logger.debug('cv_resize_interpolation=%s', cv_resize_interpolation)

        page_image = page_image.to_resized_image(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_active_mask.shape == (height, width)
        page_active_mask = page_active_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_char_mask.shape == (height, width)
        page_char_mask = page_char_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_char_height_score_map.shape == (height, width)
        page_char_height_score_map = page_char_height_score_map.to_resized_score_map(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )
        # Scores are resized as well.
        page_char_height_score_map.assign_mat(page_char_height_score_map.mat * resize_ratio)

        assert page_text_line_mask.shape == (height, width)
        page_text_line_mask = page_text_line_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_text_line_height_score_map.shape == (height, width)
        page_text_line_height_score_map = page_text_line_height_score_map.to_resized_score_map(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )
        # Scores are resized as well.
        page_text_line_height_score_map.assign_mat(
            page_text_line_height_score_map.mat * resize_ratio
        )

        return PageResizingStepOutput(
            page_image=page_image,
            page_active_mask=page_active_mask,
            page_char_mask=page_char_mask,
            page_char_height_score_map=page_char_height_score_map,
            page_text_line_mask=page_text_line_mask,
            page_text_line_height_score_map=page_text_line_height_score_map,
        )


page_resizing_step_factory = PipelineStepFactory(PageResizingStep)
class
PageResizingStepConfig:
class PageResizingStepConfig:
    """Configuration of the page resizing step."""

    # Lower / upper bound handed to `rng.uniform` when sampling the text line height
    # that the page is rescaled to.
    resized_text_line_height_min: float = 3.0
    resized_text_line_height_max: float = 10.0
    # Text line heights that are not greater than this value are ignored when the
    # reference (minimum) text line height of the page is computed.
    text_line_heights_filtering_thr: float = 1.0
PageResizingStepConfig( resized_text_line_height_min: float = 3.0, resized_text_line_height_max: float = 10.0, text_line_heights_filtering_thr: float = 1.0)
# Initializer generated by attrs for class PageResizingStepConfig.
# Each parameter defaults to the `.default` of the same-named entry in `attr_dict`
# (presumably attrs' table of declared attributes - not visible here).
def __init__(self, resized_text_line_height_min=attr_dict['resized_text_line_height_min'].default, resized_text_line_height_max=attr_dict['resized_text_line_height_max'].default, text_line_heights_filtering_thr=attr_dict['text_line_heights_filtering_thr'].default):
    self.resized_text_line_height_min = resized_text_line_height_min
    self.resized_text_line_height_max = resized_text_line_height_max
    self.text_line_heights_filtering_thr = text_line_heights_filtering_thr
Method generated by attrs for class PageResizingStepConfig.
class
PageResizingStepInput:
PageResizingStepInput( page_distortion_step_output: vkit.pipeline.text_detection.page_distortion.PageDistortionStepOutput)
# Initializer generated by attrs for class PageResizingStepInput.
# Stores the required page distortion step output on the instance.
def __init__(self, page_distortion_step_output):
    self.page_distortion_step_output = page_distortion_step_output
Method generated by attrs for class PageResizingStepInput.
class
PageResizingStepOutput:
class PageResizingStepOutput:
    """Output of the page resizing step.

    Every field is the resized counterpart of the same-named field of the page
    distortion step output. The two score maps are additionally multiplied by the
    resize ratio.
    """

    page_image: Image
    page_active_mask: Mask
    page_char_mask: Mask
    page_char_height_score_map: ScoreMap
    page_text_line_mask: Mask
    page_text_line_height_score_map: ScoreMap
PageResizingStepOutput( page_image: vkit.element.image.Image, page_active_mask: vkit.element.mask.Mask, page_char_mask: vkit.element.mask.Mask, page_char_height_score_map: vkit.element.score_map.ScoreMap, page_text_line_mask: vkit.element.mask.Mask, page_text_line_height_score_map: vkit.element.score_map.ScoreMap)
# Initializer generated by attrs for class PageResizingStepOutput.
# All six fields are required and are stored on the instance unchanged.
def __init__(self, page_image, page_active_mask, page_char_mask, page_char_height_score_map, page_text_line_mask, page_text_line_height_score_map):
    self.page_image = page_image
    self.page_active_mask = page_active_mask
    self.page_char_mask = page_char_mask
    self.page_char_height_score_map = page_char_height_score_map
    self.page_text_line_mask = page_text_line_mask
    self.page_text_line_height_score_map = page_text_line_height_score_map
Method generated by attrs for class PageResizingStepOutput.
class PageResizingStep(
    PipelineStep[
        PageResizingStepConfig,
        PageResizingStepInput,
        PageResizingStepOutput,
    ]
):  # yapf: disable
    """Rescale a distorted page so that its text lines reach a randomly sampled height.

    The page image, its masks and its height score maps are all resized to the same
    target shape.
    """

    def __init__(self, config: PageResizingStepConfig):
        super().__init__(config)

    def get_text_line_heights_min(
        self,
        page_distorted_text_line_heights: Sequence[float],
    ) -> float:
        """Return the smallest text line height after filtering and outlier removal.

        Heights that are not greater than ``config.text_line_heights_filtering_thr``
        are dropped first. Each remaining height is then scored by its absolute
        deviation from the median, divided by the median of those deviations (or by
        1.0 when that median is zero). Heights scoring 3.5 or more are discarded.

        Args:
            page_distorted_text_line_heights: Text line heights of the distorted page.

        Returns:
            The minimum of the heights that survive both stages, as a Python float.

        Raises:
            AssertionError: If no height passes the filtering threshold.
        """
        # 1. Filtering.
        filtering_thr = self.config.text_line_heights_filtering_thr
        text_line_heights = [
            text_line_height for text_line_height in page_distorted_text_line_heights
            if text_line_height > filtering_thr
        ]
        assert text_line_heights, (
            f'No text line height is greater than the filtering threshold {filtering_thr}.'
        )

        # 2. Remove outliers.
        # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
        heights = np.asarray(text_line_heights)
        deltas = np.abs(heights - np.median(heights))
        # A zero median deviation would divide by zero, hence the fallback to 1.0.
        deltas_median = np.median(deltas)
        delta_ratios = deltas / (deltas_median or 1.0)
        # At least half of the deviations are <= their median, so the mask below
        # always selects at least one height.
        return float(heights[delta_ratios < 3.5].min())

    def run(
        self,
        input: PageResizingStepInput,
        rng: RandomGenerator,
    ) -> PageResizingStepOutput:
        """Resize the distorted page, its masks and its height score maps.

        A target text line height is sampled uniformly between
        ``config.resized_text_line_height_min`` and
        ``config.resized_text_line_height_max``. The resize ratio is that target
        divided by the value returned by ``get_text_line_heights_min``.

        Args:
            input: Wraps the output of the page distortion step.
            rng: Random generator used to sample the target height and the
                interpolation method.

        Returns:
            The resized image, masks and score maps. Score map values are multiplied
            by the resize ratio.

        Raises:
            AssertionError: If a required distortion output is missing (falsy), or if
                a mask / score map shape differs from the page image shape.
        """
        page_distortion_step_output = input.page_distortion_step_output
        page_image = page_distortion_step_output.page_image
        page_active_mask = page_distortion_step_output.page_active_mask

        # The asserts below require each of these distortion outputs to be truthy.
        page_char_mask = page_distortion_step_output.page_char_mask
        assert page_char_mask

        page_char_height_score_map = page_distortion_step_output.page_char_height_score_map
        assert page_char_height_score_map

        page_text_line_mask = page_distortion_step_output.page_text_line_mask
        assert page_text_line_mask

        page_text_line_height_score_map = \
            page_distortion_step_output.page_text_line_height_score_map
        assert page_text_line_height_score_map

        page_distorted_text_line_heights = page_distortion_step_output.page_text_line_heights
        assert page_distorted_text_line_heights

        # Resizing.
        height, width = page_image.shape
        text_line_heights_min = self.get_text_line_heights_min(page_distorted_text_line_heights)
        logger.debug('text_line_heights_min=%s', text_line_heights_min)
        resized_text_line_height = rng.uniform(
            self.config.resized_text_line_height_min,
            self.config.resized_text_line_height_max,
        )
        resize_ratio = resized_text_line_height / text_line_heights_min

        # round() can yield 0 for a very small ratio or a very thin page. A zero-sized
        # target is presumably rejected by the underlying resize, so keep >= 1 pixel.
        resized_height = max(1, round(resize_ratio * height))
        resized_width = max(1, round(resize_ratio * width))

        # INTER_AREA is only offered as a candidate when the page is shrunk.
        cv_resize_interpolation = sample_cv_resize_interpolation(
            rng,
            include_cv_inter_area=(resize_ratio < 1.0),
        )
        logger.debug('cv_resize_interpolation=%s', cv_resize_interpolation)

        page_image = page_image.to_resized_image(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_active_mask.shape == (height, width)
        page_active_mask = page_active_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_char_mask.shape == (height, width)
        page_char_mask = page_char_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_char_height_score_map.shape == (height, width)
        page_char_height_score_map = page_char_height_score_map.to_resized_score_map(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )
        # Scores are resized as well.
        page_char_height_score_map.assign_mat(page_char_height_score_map.mat * resize_ratio)

        assert page_text_line_mask.shape == (height, width)
        page_text_line_mask = page_text_line_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )

        assert page_text_line_height_score_map.shape == (height, width)
        page_text_line_height_score_map = page_text_line_height_score_map.to_resized_score_map(
            resized_height=resized_height,
            resized_width=resized_width,
            cv_resize_interpolation=cv_resize_interpolation,
        )
        # Scores are resized as well.
        page_text_line_height_score_map.assign_mat(
            page_text_line_height_score_map.mat * resize_ratio
        )

        return PageResizingStepOutput(
            page_image=page_image,
            page_active_mask=page_active_mask,
            page_char_mask=page_char_mask,
            page_char_height_score_map=page_char_height_score_map,
            page_text_line_mask=page_text_line_mask,
            page_text_line_height_score_map=page_text_line_height_score_map,
        )
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
class Mapping(Generic[KT, VT]): def __getitem__(self, key: KT) -> VT: ... # Etc.
This class can then be used as follows::
def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT: try: return mapping[key] except KeyError: return default
PageResizingStep( config: vkit.pipeline.text_detection.page_resizing.PageResizingStepConfig)
def
get_text_line_heights_min(self, page_distorted_text_line_heights: Sequence[float]):
def get_text_line_heights_min(
    self,
    page_distorted_text_line_heights: Sequence[float],
) -> float:
    """Return the smallest text line height after filtering and outlier removal.

    Heights that are not greater than ``config.text_line_heights_filtering_thr``
    are dropped first. Each remaining height is then scored by its absolute
    deviation from the median, divided by the median of those deviations (or by
    1.0 when that median is zero). Heights scoring 3.5 or more are discarded.

    Args:
        page_distorted_text_line_heights: Text line heights of the distorted page.

    Returns:
        The minimum of the heights that survive both stages, as a Python float.

    Raises:
        AssertionError: If no height passes the filtering threshold.
    """
    # 1. Filtering.
    filtering_thr = self.config.text_line_heights_filtering_thr
    text_line_heights = [
        text_line_height for text_line_height in page_distorted_text_line_heights
        if text_line_height > filtering_thr
    ]
    assert text_line_heights, (
        f'No text line height is greater than the filtering threshold {filtering_thr}.'
    )

    # 2. Remove outliers.
    # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
    heights = np.asarray(text_line_heights)
    deltas = np.abs(heights - np.median(heights))
    # A zero median deviation would divide by zero, hence the fallback to 1.0.
    deltas_median = np.median(deltas)
    delta_ratios = deltas / (deltas_median or 1.0)
    # At least half of the deviations are <= their median, so the mask below
    # always selects at least one height.
    return float(heights[delta_ratios < 3.5].min())
def
run( self, input: vkit.pipeline.text_detection.page_resizing.PageResizingStepInput, rng: numpy.random._generator.Generator):
def run(
    self,
    input: PageResizingStepInput,
    rng: RandomGenerator,
) -> PageResizingStepOutput:
    """Resize the distorted page, its masks and its height score maps.

    A target text line height is sampled uniformly between
    ``config.resized_text_line_height_min`` and
    ``config.resized_text_line_height_max``. The resize ratio is that target
    divided by the value returned by ``get_text_line_heights_min``.

    Args:
        input: Wraps the output of the page distortion step.
        rng: Random generator used to sample the target height and the
            interpolation method.

    Returns:
        The resized image, masks and score maps. Score map values are multiplied
        by the resize ratio.

    Raises:
        AssertionError: If a required distortion output is missing (falsy), or if
            a mask / score map shape differs from the page image shape.
    """
    page_distortion_step_output = input.page_distortion_step_output
    page_image = page_distortion_step_output.page_image
    page_active_mask = page_distortion_step_output.page_active_mask

    # The asserts below require each of these distortion outputs to be truthy.
    page_char_mask = page_distortion_step_output.page_char_mask
    assert page_char_mask

    page_char_height_score_map = page_distortion_step_output.page_char_height_score_map
    assert page_char_height_score_map

    page_text_line_mask = page_distortion_step_output.page_text_line_mask
    assert page_text_line_mask

    page_text_line_height_score_map = \
        page_distortion_step_output.page_text_line_height_score_map
    assert page_text_line_height_score_map

    page_distorted_text_line_heights = page_distortion_step_output.page_text_line_heights
    assert page_distorted_text_line_heights

    # Resizing.
    height, width = page_image.shape
    text_line_heights_min = self.get_text_line_heights_min(page_distorted_text_line_heights)
    logger.debug('text_line_heights_min=%s', text_line_heights_min)
    resized_text_line_height = rng.uniform(
        self.config.resized_text_line_height_min,
        self.config.resized_text_line_height_max,
    )
    resize_ratio = resized_text_line_height / text_line_heights_min

    # round() can yield 0 for a very small ratio or a very thin page. A zero-sized
    # target is presumably rejected by the underlying resize, so keep >= 1 pixel.
    resized_height = max(1, round(resize_ratio * height))
    resized_width = max(1, round(resize_ratio * width))

    # INTER_AREA is only offered as a candidate when the page is shrunk.
    cv_resize_interpolation = sample_cv_resize_interpolation(
        rng,
        include_cv_inter_area=(resize_ratio < 1.0),
    )
    logger.debug('cv_resize_interpolation=%s', cv_resize_interpolation)

    page_image = page_image.to_resized_image(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )

    assert page_active_mask.shape == (height, width)
    page_active_mask = page_active_mask.to_resized_mask(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )

    assert page_char_mask.shape == (height, width)
    page_char_mask = page_char_mask.to_resized_mask(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )

    assert page_char_height_score_map.shape == (height, width)
    page_char_height_score_map = page_char_height_score_map.to_resized_score_map(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )
    # Scores are resized as well.
    page_char_height_score_map.assign_mat(page_char_height_score_map.mat * resize_ratio)

    assert page_text_line_mask.shape == (height, width)
    page_text_line_mask = page_text_line_mask.to_resized_mask(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )

    assert page_text_line_height_score_map.shape == (height, width)
    page_text_line_height_score_map = page_text_line_height_score_map.to_resized_score_map(
        resized_height=resized_height,
        resized_width=resized_width,
        cv_resize_interpolation=cv_resize_interpolation,
    )
    # Scores are resized as well.
    page_text_line_height_score_map.assign_mat(
        page_text_line_height_score_map.mat * resize_ratio
    )

    return PageResizingStepOutput(
        page_image=page_image,
        page_active_mask=page_active_mask,
        page_char_mask=page_char_mask,
        page_char_height_score_map=page_char_height_score_map,
        page_text_line_mask=page_text_line_mask,
        page_text_line_height_score_map=page_text_line_height_score_map,
    )