vkit.pipeline.text_detection.page_text_region
1# Copyright 2022 vkit-x Administrator. All Rights Reserved. 2# 3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses. 4# 5# The commercial license gives you the full rights to create and distribute software 6# on your own terms without any SSPL license obligations. For more information, 7# please see the "LICENSE_COMMERCIAL.txt" file. 8# 9# This project is also available under Server Side Public License (SSPL). 10# The SSPL licensing is ideal for use cases such as open source projects with 11# SSPL distribution, student/academic purposes, hobby projects, internal research 12# projects without external distribution, or other projects where all SSPL 13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file. 14from typing import List, Optional, Dict, DefaultDict, Sequence, Tuple, Set 15from collections import defaultdict 16import itertools 17import math 18import statistics 19import logging 20import warnings 21 22import attrs 23from numpy.random import Generator as RandomGenerator 24import numpy as np 25from shapely.errors import ShapelyDeprecationWarning 26from shapely.strtree import STRtree 27from shapely.geometry import Polygon as ShapelyPolygon 28from rectpack import newPacker as RectPacker 29 30from vkit.utility import rng_choice, rng_choice_with_size 31from vkit.element import Box, Polygon, Mask, Image, ElementSetOperationMode 32from vkit.mechanism.distortion import rotate 33from ..interface import PipelineStep, PipelineStepFactory 34from .page_distortion import PageDistortionStepOutput 35from .page_resizing import PageResizingStepOutput 36 37logger = logging.getLogger(__name__) 38 39# Shapely version has been explicitly locked under 2.0, hence ignore this warning. 
warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning)


@attrs.define
class PageTextRegionStepConfig:
    # Minimum long-side / short-side ratio for a region to count as "typical"
    # when estimating the dominant text angle.
    text_region_flattener_typical_long_side_ratio_min: float = 3.0
    # Range from which the polygon dilate ratio is sampled uniformly.
    text_region_flattener_text_region_polygon_dilate_ratio_min: float = 0.85
    text_region_flattener_text_region_polygon_dilate_ratio_max: float = 1.0
    # Inclusive range of the target char height median used to scale positive regions.
    text_region_resize_char_height_median_min: int = 30
    text_region_resize_char_height_median_max: int = 45
    # Probability of post-rotating a typical / untypical region.
    text_region_typical_post_rotate_prob: float = 0.2
    text_region_untypical_post_rotate_prob: float = 0.2
    # Target fraction of negative (non-text) regions among all sampled regions.
    negative_text_region_ratio: float = 0.1
    # NOTE(review): not read by the code visible in this part of the file -- confirm usage.
    negative_text_region_post_rotate_prob: float = 0.2
    # Padding, in pixels, added around every flattened region when stacking.
    stack_flattened_text_regions_pad: int = 2
    # NOTE(review): the three options below are presumably consumed by the step's
    # `run` method -- confirm.
    enable_post_rotate: bool = False
    post_rotate_angle_min: int = -10
    post_rotate_angle_max: int = 10
    # When set, the step collects intermediate results in a debug object.
    enable_debug: bool = False


@attrs.define
class PageTextRegionStepInput:
    page_distortion_step_output: PageDistortionStepOutput
    page_resizing_step_output: PageResizingStepOutput


@attrs.define
class PageTextRegionInfo:
    # Polygon of a precise text region and the char polygons bound to it.
    precise_text_region_polygon: Polygon
    char_polygons: Sequence[Polygon]


@attrs.define
class FlattenedTextRegion:
    """A text region that has been cut out of the page and rotated to be flat."""

    # Whether the region was used to estimate the typical text angle.
    is_typical: bool
    # Region polygon in page coordinates.
    text_region_polygon: Polygon
    # Region image extracted from the page, before rotation.
    text_region_image: Image
    # Mask of the region extended into surrounding non-text area.
    bounding_extended_text_region_mask: Mask
    # Rotation angle applied to flatten the region.
    flattening_rotate_angle: int
    # Shape of the rotated image before trimming.
    shape_before_trim: Tuple[int, int]
    # Box used to trim the rotated image.
    rotated_trimmed_box: Box
    # Shape of the trimmed image before any resizing.
    shape_before_resize: Tuple[int, int]
    # Extra rotation applied after flattening, 0 if none.
    post_rotate_angle: int
    # Final image / mask / char polygons of the flattened region.
    flattened_image: Image
    flattened_mask: Mask
    flattened_char_polygons: Optional[Sequence[Polygon]]

    @property
    def shape(self):
        return self.flattened_image.shape

    @property
    def height(self):
        return self.flattened_image.height

    @property
    def width(self):
        return self.flattened_image.width

    @property
    def area(self):
        return self.flattened_image.area

    def get_char_height_median(self):
        """Return the median rectangular height of the flattened char polygons.

        The region must carry at least one char polygon.
        """
        assert self.flattened_char_polygons
        return statistics.median(
            char_polygon.get_rectangular_height()
            for char_polygon in self.flattened_char_polygons
        )

    # Backward-compatible alias: the method was originally published under this
    # misspelled name.
    get_char_height_meidan = get_char_height_median

    def to_resized_flattened_text_region(
        self,
        resized_height: Optional[int] = None,
        resized_width: Optional[int] = None,
    ):
        """Return a copy whose image, mask and char polygons are resized together."""
        resized_flattened_image = self.flattened_image.to_resized_image(
            resized_height=resized_height,
            resized_width=resized_width,
        )

        resized_flattened_mask = self.flattened_mask.to_resized_mask(
            resized_height=resized_height,
            resized_width=resized_width,
        )

        resized_flattened_char_polygons = None
        if self.flattened_char_polygons is not None:
            # Polygons are rescaled relative to the current (pre-resize) shape.
            resized_flattened_char_polygons = [
                flattened_char_polygon.to_conducted_resized_polygon(
                    self.shape,
                    resized_height=resized_height,
                    resized_width=resized_width,
                ) for flattened_char_polygon in self.flattened_char_polygons
            ]

        return attrs.evolve(
            self,
            flattened_image=resized_flattened_image,
            flattened_mask=resized_flattened_mask,
            flattened_char_polygons=resized_flattened_char_polygons,
        )

    def to_post_rotated_flattened_text_region(
        self,
        post_rotate_angle: int,
    ):
        """Return a copy rotated by `post_rotate_angle`.

        Only a region that has not been post-rotated yet can be rotated.
        """
        assert self.post_rotate_angle == 0

        # NOTE: No need to trim.
        rotated_result = rotate.distort(
            {'angle': post_rotate_angle},
            image=self.flattened_image,
            mask=self.flattened_mask,
            polygons=self.flattened_char_polygons,
        )
        rotated_flattened_image = rotated_result.image
        assert rotated_flattened_image
        rotated_flattened_mask = rotated_result.mask
        assert rotated_flattened_mask
        rotated_flattened_char_polygons = rotated_result.polygons

        return attrs.evolve(
            self,
            post_rotate_angle=post_rotate_angle,
            flattened_image=rotated_flattened_image,
            flattened_mask=rotated_flattened_mask,
            flattened_char_polygons=rotated_flattened_char_polygons,
        )


@attrs.define
class PageTextRegionStepDebug:
    # All fields start as None and are filled in while the step runs.
    page_image: Optional[Image] = attrs.field(default=None)
    precise_text_region_candidate_polygons: Optional[Sequence[Polygon]] = \
        attrs.field(default=None)
    page_text_region_infos: Optional[Sequence[PageTextRegionInfo]] = attrs.field(default=None)
    flattened_text_regions: Optional[Sequence[FlattenedTextRegion]] = attrs.field(default=None)


@attrs.define
class PageTextRegionStepOutput:
    page_image: Image
    page_char_polygons: Sequence[Polygon]
    shape_before_rotate: Tuple[int, int]
    rotate_angle: int
    debug: Optional[PageTextRegionStepDebug]
def calculate_boxed_masks_intersected_ratio(
    anchor_mask: Mask,
    candidate_mask: Mask,
    use_candidate_as_base: bool = False,
):
    """Return the overlap ratio of two box-attached masks.

    The intersection area is divided by the candidate area when
    `use_candidate_as_base` is set, otherwise by the union area. Returns 0.0
    when the boxes do not overlap or when the base area is zero.
    """
    anchor_box = anchor_mask.box
    assert anchor_box

    candidate_box = candidate_mask.box
    assert candidate_box

    # Calculate intersection.
    up = max(anchor_box.up, candidate_box.up)
    down = min(anchor_box.down, candidate_box.down)
    left = max(anchor_box.left, candidate_box.left)
    right = min(anchor_box.right, candidate_box.right)

    if up > down or left > right:
        return 0.0

    # Slice both mats to the shared window, using coordinates relative to each box.
    np_intersected_anchor_mask = anchor_mask.mat[
        up - anchor_box.up:down - anchor_box.up + 1,
        left - anchor_box.left:right - anchor_box.left + 1,
    ]  # yapf: disable
    np_intersected_candidate_mask = candidate_mask.mat[
        up - candidate_box.up:down - candidate_box.up + 1,
        left - candidate_box.left:right - candidate_box.left + 1,
    ]  # yapf: disable
    np_intersected_mask = np_intersected_anchor_mask & np_intersected_candidate_mask
    intersected_area = int(np_intersected_mask.sum())

    if use_candidate_as_base:
        base_area = int(candidate_mask.np_mask.sum())
    else:
        base_area = (
            int(anchor_mask.np_mask.sum()) + int(candidate_mask.np_mask.sum()) - intersected_area
        )

    # An empty base (e.g. an all-zero candidate mask) has no overlap by definition.
    if base_area <= 0:
        return 0.0

    return intersected_area / base_area


class TextRegionFlattener:
    """Cut text regions out of an image and rotate each of them to be flat.

    All intermediate results are kept as attributes; the final result is
    `flattened_text_regions`.
    """

    @classmethod
    def patch_text_region_polygons(
        cls,
        text_region_polygons: Sequence[Polygon],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
    ):
        """Grow each text region polygon so that it covers all of its char polygons.

        Returns the input unchanged when no char polygons are given.
        """
        if grouped_char_polygons is None:
            return text_region_polygons

        assert len(text_region_polygons) == len(grouped_char_polygons)

        patched_text_region_polygons: List[Polygon] = []
        for text_region_polygon, char_polygons in zip(text_region_polygons, grouped_char_polygons):
            # Need to make sure all char polygons are included.
            unionized_polygons = [text_region_polygon]
            unionized_polygons.extend(char_polygons)

            bounding_box = Box.from_boxes((polygon.bounding_box for polygon in unionized_polygons))
            mask = Mask.from_shapable(bounding_box).to_box_attached(bounding_box)
            for polygon in unionized_polygons:
                polygon.fill_mask(mask)

            patched_text_region_polygons.append(mask.to_external_polygon())

        return patched_text_region_polygons

    @classmethod
    def process_text_region_polygons(
        cls,
        text_region_polygon_dilate_ratio: float,
        shape: Tuple[int, int],
        text_region_polygons: Sequence[Polygon],
        force_no_dilation_flags: Optional[Sequence[bool]] = None,
    ):
        """Build the extended mask and bounding rectangle of every text region.

        Each region is optionally dilated, then limited so that it never covers
        another text region, and finally extended with the non-text area inside
        its bounding rectangle.

        Returns:
            (bounding_extended_text_region_masks, bounding_rectangular_polygons),
            both aligned with `text_region_polygons`.
        """
        text_mask = Mask.from_polygons(shape, text_region_polygons)
        non_text_mask = text_mask.to_inverted_mask()

        box = Box.from_shape(shape)
        text_mask = text_mask.to_box_attached(box)
        non_text_mask = non_text_mask.to_box_attached(box)

        bounding_extended_text_region_masks: List[Mask] = []
        bounding_rectangular_polygons: List[Polygon] = []

        if force_no_dilation_flags is None:
            force_no_dilation_flags_iter = itertools.repeat(False)
        else:
            assert len(force_no_dilation_flags) == len(text_region_polygons)
            force_no_dilation_flags_iter = force_no_dilation_flags

        for text_region_polygon, force_no_dilation_flag in zip(
            text_region_polygons, force_no_dilation_flags_iter
        ):
            original_text_region_polygon = text_region_polygon

            if not force_no_dilation_flag:
                # Dilate.
                text_region_polygon = text_region_polygon.to_dilated_polygon(
                    ratio=text_region_polygon_dilate_ratio,
                )
                text_region_polygon = text_region_polygon.to_clipped_polygon(shape)

            # Get bounding rectangular box (polygon).
            bounding_rectangular_polygon = \
                text_region_polygon.to_bounding_rectangular_polygon(shape)

            bounding_box = bounding_rectangular_polygon.bounding_box

            # Get other text region.
            bounding_other_text_mask = bounding_rectangular_polygon.extract_mask(text_mask).copy()
            # NOTE: Use the original text region polygon to unset the current text mask.
            original_text_region_polygon.fill_mask(bounding_other_text_mask, 0)

            # Get potentially dilated text region.
            bounding_text_mask = Mask.from_shapable(bounding_other_text_mask)
            bounding_text_mask = bounding_text_mask.to_box_attached(bounding_box)
            # NOTE: Use the potentially dilated text region polygon to set the current text mask.
            text_region_polygon.fill_mask(bounding_text_mask, value=1)

            # Should not use the potentially dilated text region polygon anymore.
            del text_region_polygon

            # Trim potentially dilated text region polygon by eliminating other text region.
            bounding_trimmed_text_mask = Mask.from_masks(
                bounding_box,
                [
                    # Includes the potentially dilated text region.
                    bounding_text_mask,
                    # But not includes any other text regions.
                    bounding_other_text_mask.to_inverted_mask(),
                ],
                ElementSetOperationMode.INTERSECT,
            )

            # Get non-text region.
            bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask)

            # Combine trimmed text region and non-text region.
            bounding_extended_text_region_mask = Mask.from_masks(
                bounding_box,
                [bounding_trimmed_text_mask, bounding_non_text_mask],
            )

            bounding_extended_text_region_masks.append(bounding_extended_text_region_mask)
            bounding_rectangular_polygons.append(bounding_rectangular_polygon)

        return bounding_extended_text_region_masks, bounding_rectangular_polygons

    @classmethod
    def analyze_bounding_rectangular_polygons(
        cls,
        bounding_rectangular_polygons: Sequence[Polygon],
    ):
        """Measure the side ratio and long-side angle of every bounding rectangle.

        Returns:
            (long_side_ratios, long_side_angles). Each angle is an integer
            degree in [0, 180).
        """
        long_side_ratios: List[float] = []
        long_side_angles: List[int] = []

        for polygon in bounding_rectangular_polygons:
            # Get reference line.
            point0, point1, _, point3 = polygon.points
            side0_length = math.hypot(
                point0.smooth_y - point1.smooth_y,
                point0.smooth_x - point1.smooth_x,
            )
            side1_length = math.hypot(
                point0.smooth_y - point3.smooth_y,
                point0.smooth_x - point3.smooth_x,
            )

            long_side_ratios.append(
                max(side0_length, side1_length) / min(side0_length, side1_length)
            )

            point_a = point0
            if side0_length > side1_length:
                # Reference line (p0 -> p1).
                point_b = point1
            else:
                # Reference line (p0 -> p3).
                point_b = point3

            # Get the angle of reference line, in [0, 180) degree.
            np_theta = np.arctan2(
                point_a.smooth_y - point_b.smooth_y,
                point_a.smooth_x - point_b.smooth_x,
            )
            np_theta = np_theta % np.pi
            long_side_angle = round(np_theta / np.pi * 180) % 180
            long_side_angles.append(long_side_angle)

        return long_side_ratios, long_side_angles

    @classmethod
    def get_typical_angle(
        cls,
        typical_long_side_ratio_min: float,
        long_side_ratios: Sequence[float],
        long_side_angles: Sequence[int],
    ):
        """Estimate the dominant text angle from the elongated ("typical") regions.

        A region is typical when its long-side ratio is at least
        `typical_long_side_ratio_min`.

        Returns:
            (typical_angle, typical_indices). `typical_angle` is an integer
            degree in [0, 180), or None when there is no typical region.
        """
        typical_indices: Set[int] = set()
        typical_long_side_angles: List[float] = []

        for idx, (long_side_ratio, long_side_angle) in \
                enumerate(zip(long_side_ratios, long_side_angles)):
            if long_side_ratio < typical_long_side_ratio_min:
                continue

            typical_indices.add(idx)
            typical_long_side_angles.append(long_side_angle)

        if not typical_long_side_angles:
            return None, typical_indices

        # NOTE: Due to the sudden change between 179 and 0 degree,
        # we need to normalize the range to [0, 360) before calculate the mean of angles.
        two_pi = 2 * np.pi
        np_angles = np.asarray(typical_long_side_angles) / 180 * two_pi
        np_sin_mean = np.sin(np_angles).mean()
        np_cos_mean = np.cos(np_angles).mean()

        np_theta = np.arctan2(np_sin_mean, np_cos_mean)
        np_theta = np_theta % two_pi
        # Rescale the range back to [0, 180).
        # Rounding can produce exactly 180 (e.g. a mean of 179.6), which is the
        # same direction as 0, hence the wrap.
        typical_angle = round(np_theta / two_pi * 180) % 180

        return typical_angle, typical_indices

    @classmethod
    def get_flattening_rotate_angles(
        cls,
        typical_angle: Optional[int],
        typical_indices: Set[int],
        long_side_angles: Sequence[int],
    ):
        """Pick the rotation angle that flattens each region.

        Typical regions (and all regions when there is no typical angle) follow
        their own long side. The others follow whichever of their long or short
        side is closer to the typical angle.
        """
        if typical_angle is not None:
            assert typical_indices

        flattening_rotate_angles: List[int] = []

        for idx, long_side_angle in enumerate(long_side_angles):
            if typical_angle is None or idx in typical_indices:
                # Dominated by long_side_angle.
                main_angle = long_side_angle

            else:
                # Dominated by typical_angle.
                short_side_angle = (long_side_angle + 90) % 180
                # Angular distances on a 180-degree circle, in [0, 90].
                long_side_delta = abs((long_side_angle - typical_angle + 90) % 180 - 90)
                short_side_delta = abs((short_side_angle - typical_angle + 90) % 180 - 90)

                if long_side_delta < short_side_delta:
                    main_angle = long_side_angle
                else:
                    main_angle = short_side_angle

            # Angle for flattening.
            if main_angle <= 90:
                # 0 or [270, 360).
                flattening_rotate_angle = (360 - main_angle) % 360
            else:
                # [1, 90).
                flattening_rotate_angle = 180 - main_angle
            flattening_rotate_angles.append(flattening_rotate_angle)

        return flattening_rotate_angles

    @classmethod
    def build_flattened_text_regions(
        cls,
        image: Image,
        text_region_polygons: Sequence[Polygon],
        bounding_extended_text_region_masks: Sequence[Mask],
        typical_indices: Set[int],
        flattening_rotate_angles: Sequence[int],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
    ):
        """Extract, rotate and trim every region into a `FlattenedTextRegion`."""
        flattened_text_regions: List[FlattenedTextRegion] = []

        for idx, (
            text_region_polygon,
            bounding_extended_text_region_mask,
            flattening_rotate_angle,
        ) in enumerate(
            zip(
                text_region_polygons,
                bounding_extended_text_region_masks,
                flattening_rotate_angles,
            )
        ):
            bounding_box = bounding_extended_text_region_mask.box
            assert bounding_box

            # Extract image.
            text_region_image = bounding_extended_text_region_mask.extract_image(image)

            # Shift char polygons.
            relative_char_polygons = None
            if grouped_char_polygons is not None:
                char_polygons = grouped_char_polygons[idx]
                relative_char_polygons = [
                    char_polygon.to_relative_polygon(
                        origin_y=bounding_box.up,
                        origin_x=bounding_box.left,
                    ) for char_polygon in char_polygons
                ]

            # Rotate.
            rotated_result = rotate.distort(
                {'angle': flattening_rotate_angle},
                image=text_region_image,
                mask=bounding_extended_text_region_mask,
                polygons=relative_char_polygons,
            )
            rotated_text_region_image = rotated_result.image
            assert rotated_text_region_image
            rotated_bounding_extended_text_region_mask = rotated_result.mask
            assert rotated_bounding_extended_text_region_mask
            # Could be None.
            rotated_char_polygons = rotated_result.polygons

            # Trim.
            rotated_trimmed_box = rotated_bounding_extended_text_region_mask.to_external_box()

            trimmed_text_region_image = rotated_text_region_image.to_cropped_image(
                up=rotated_trimmed_box.up,
                down=rotated_trimmed_box.down,
                left=rotated_trimmed_box.left,
                right=rotated_trimmed_box.right,
            )

            trimmed_mask = rotated_trimmed_box.extract_mask(
                rotated_bounding_extended_text_region_mask
            )

            trimmed_char_polygons = None
            if rotated_char_polygons:
                trimmed_char_polygons = [
                    rotated_char_polygon.to_relative_polygon(
                        origin_y=rotated_trimmed_box.up,
                        origin_x=rotated_trimmed_box.left,
                    ) for rotated_char_polygon in rotated_char_polygons
                ]

            flattened_text_regions.append(
                FlattenedTextRegion(
                    is_typical=(idx in typical_indices),
                    text_region_polygon=text_region_polygon,
                    # Reuse the image extracted above instead of extracting it again.
                    text_region_image=text_region_image,
                    bounding_extended_text_region_mask=bounding_extended_text_region_mask,
                    flattening_rotate_angle=flattening_rotate_angle,
                    shape_before_trim=rotated_text_region_image.shape,
                    rotated_trimmed_box=rotated_trimmed_box,
                    shape_before_resize=trimmed_text_region_image.shape,
                    post_rotate_angle=0,
                    flattened_image=trimmed_text_region_image,
                    flattened_mask=trimmed_mask,
                    flattened_char_polygons=trimmed_char_polygons,
                )
            )

        return flattened_text_regions

    def __init__(
        self,
        typical_long_side_ratio_min: float,
        text_region_polygon_dilate_ratio: float,
        image: Image,
        text_region_polygons: Sequence[Polygon],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None,
        is_training: bool = False,
    ):
        """Run the whole flattening procedure and keep every intermediate result.

        In training mode `grouped_char_polygons` is required, and a region
        without any char polygon (a negative region) is never dilated.
        """
        # NOTE: The misspelled attribute name is kept since it is publicly accessible.
        self.origional_text_region_polygons = text_region_polygons

        self.text_region_polygons = self.patch_text_region_polygons(
            text_region_polygons=text_region_polygons,
            grouped_char_polygons=grouped_char_polygons,
        )

        force_no_dilation_flags = None
        if is_training:
            assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons)
            force_no_dilation_flags = []
            for char_polygons in grouped_char_polygons:
                force_no_dilation_flags.append(not char_polygons)

        self.bounding_extended_text_region_masks, self.bounding_rectangular_polygons = \
            self.process_text_region_polygons(
                text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
                shape=image.shape,
                text_region_polygons=self.text_region_polygons,
                force_no_dilation_flags=force_no_dilation_flags,
            )

        self.long_side_ratios, self.long_side_angles = \
            self.analyze_bounding_rectangular_polygons(self.bounding_rectangular_polygons)

        self.typical_angle, self.typical_indices = self.get_typical_angle(
            typical_long_side_ratio_min=typical_long_side_ratio_min,
            long_side_ratios=self.long_side_ratios,
            long_side_angles=self.long_side_angles,
        )

        self.flattening_rotate_angles = self.get_flattening_rotate_angles(
            typical_angle=self.typical_angle,
            typical_indices=self.typical_indices,
            long_side_angles=self.long_side_angles,
        )

        # The unpatched polygons are the ones recorded on the results.
        self.flattened_text_regions = self.build_flattened_text_regions(
            image=image,
            text_region_polygons=self.origional_text_region_polygons,
            bounding_extended_text_region_masks=self.bounding_extended_text_region_masks,
            typical_indices=self.typical_indices,
            flattening_rotate_angles=self.flattening_rotate_angles,
            grouped_char_polygons=grouped_char_polygons,
        )
def build_background_image_for_stacking(height: int, width: int):
    """Build an RGB background filled with a repeating red/green/blue pixel pattern.

    Three row patterns are prepared, each cycling through the three colors with
    a different starting color, and the rows of the image cycle through them.
    """
    np_rgb_rows = [np.zeros((width, 3), dtype=np.uint8) for _ in range(3)]
    rgb_tuples = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]

    for color_offset, np_row in enumerate(np_rgb_rows):
        for color_idx in range(3):
            color_tuple = rgb_tuples[(color_offset + color_idx) % 3]
            np_row[color_idx::3] = color_tuple

    np_image = np.zeros((height, width, 3), dtype=np.uint8)
    for row_offset, np_row in enumerate(np_rgb_rows):
        np_image[row_offset::3] = np_row

    return Image(mat=np_image)


def stack_flattened_text_regions(
    page_pad: int,
    flattened_text_regions_pad: int,
    flattened_text_regions: Sequence[FlattenedTextRegion],
):
    """Pack all flattened text regions into a single image.

    Args:
        page_pad: Padding added around the whole packed page.
        flattened_text_regions_pad: Padding added around each region.
        flattened_text_regions: Regions to pack; must not be empty.

    Returns:
        (image, boxes, char_polygons). `boxes` is aligned with
        `flattened_text_regions` and gives the location of each region in
        `image`; `char_polygons` are the char polygons shifted to the image.

    Raises:
        ValueError: If `flattened_text_regions` is empty.
    """
    if not flattened_text_regions:
        # The page size is derived from the packed boxes, so nothing can be built.
        raise ValueError('flattened_text_regions must not be empty.')

    page_double_pad = 2 * page_pad
    flattened_text_regions_double_pad = 2 * flattened_text_regions_pad

    rect_packer = RectPacker(rotation=False)

    # Add box and bin.
    # NOTE: Only one bin is added, that is, packing all text region into one image.
    bin_width = 0
    bin_height = 0

    for ftr_idx, flattened_text_region in enumerate(flattened_text_regions):
        padded_width = flattened_text_region.width + flattened_text_regions_double_pad
        padded_height = flattened_text_region.height + flattened_text_regions_double_pad

        rect_packer.add_rect(
            width=padded_width,
            height=padded_height,
            rid=ftr_idx,
        )

        # The bin must be able to hold all padded rects stacked vertically,
        # so the padding of every rect has to be counted, not just one.
        bin_width = max(bin_width, padded_width)
        bin_height += padded_height

    rect_packer.add_bin(width=bin_width, height=bin_height)

    # Pack boxes.
    rect_packer.pack()  # type: ignore

    # Get packed boxes.
    unordered_boxes: List[Box] = []
    ftr_indices: List[int] = []
    for bin_idx, x, y, width, height, ftr_idx in rect_packer.rect_list():
        assert bin_idx == 0
        unordered_boxes.append(Box(
            up=y,
            down=y + height - 1,
            left=x,
            right=x + width - 1,
        ))
        ftr_indices.append(ftr_idx)

    # Every region must have been placed.
    assert len(ftr_indices) == len(flattened_text_regions)

    # Order boxes, so that they are aligned with flattened_text_regions.
    inverse_ftr_indices = [-1] * len(ftr_indices)
    for inverse_ftr_idx, ftr_idx in enumerate(ftr_indices):
        inverse_ftr_indices[ftr_idx] = inverse_ftr_idx
    for inverse_ftr_idx in inverse_ftr_indices:
        assert inverse_ftr_idx >= 0
    padded_boxes = [unordered_boxes[inverse_ftr_idx] for inverse_ftr_idx in inverse_ftr_indices]

    page_height = max(box.down for box in padded_boxes) + 1 + page_double_pad
    page_width = max(box.right for box in padded_boxes) + 1 + page_double_pad

    image = build_background_image_for_stacking(page_height, page_width)
    boxes: List[Box] = []
    char_polygons: List[Polygon] = []

    for padded_box, flattened_text_region in zip(padded_boxes, flattened_text_regions):
        assert flattened_text_region.height + flattened_text_regions_double_pad \
            == padded_box.height
        assert flattened_text_region.width + flattened_text_regions_double_pad \
            == padded_box.width

        # Remove box padding.
        up = padded_box.up + flattened_text_regions_pad + page_pad
        left = padded_box.left + flattened_text_regions_pad + page_pad

        box = Box(
            up=up,
            down=up + flattened_text_region.height - 1,
            left=left,
            right=left + flattened_text_region.width - 1,
        )
        boxes.append(box)

        # Render.
        box.fill_image(
            image,
            flattened_text_region.flattened_image,
            image_mask=flattened_text_region.flattened_mask,
        )

        if flattened_text_region.flattened_char_polygons:
            for char_polygon in flattened_text_region.flattened_char_polygons:
                char_polygons.append(char_polygon.to_shifted_polygon(
                    offset_y=up,
                    offset_x=left,
                ))

    return image, boxes, char_polygons
class PageTextRegionStep(
    PipelineStep[
        PageTextRegionStepConfig,
        PageTextRegionStepInput,
        PageTextRegionStepOutput,
    ]
):  # yapf: disable

    @classmethod
    def generate_precise_text_region_candidate_polygons(
        cls,
        precise_mask: Mask,
        disconnected_text_region_mask: Mask,
    ):
        """Return the polygons of the area shared by both box-attached masks.

        The boxes of the two masks must overlap.
        """
        assert precise_mask.box and disconnected_text_region_mask.box

        # Get the intersection.
        intersected_box = Box(
            up=max(precise_mask.box.up, disconnected_text_region_mask.box.up),
            down=min(precise_mask.box.down, disconnected_text_region_mask.box.down),
            left=max(precise_mask.box.left, disconnected_text_region_mask.box.left),
            right=min(precise_mask.box.right, disconnected_text_region_mask.box.right),
        )
        assert intersected_box.up <= intersected_box.down
        assert intersected_box.left <= intersected_box.right

        precise_mask = intersected_box.extract_mask(precise_mask)
        disconnected_text_region_mask = intersected_box.extract_mask(disconnected_text_region_mask)

        # Apply mask bitwise-and operation.
        intersected_mask = Mask(
            mat=(disconnected_text_region_mask.mat & precise_mask.mat).astype(np.uint8)
        )
        intersected_mask = intersected_mask.to_box_attached(intersected_box)

        # NOTE:
        # 1. Could extract more than one polygons.
        # 2. Some polygons are in border and should be removed later.
        return intersected_mask.to_disconnected_polygons()

    @classmethod
    def strtree_query_intersected_polygons(
        cls,
        strtree: STRtree,
        id_to_anchor_polygon: Dict[int, Polygon],
        candidate_polygon: Polygon,
    ):
        """Yield every anchor polygon returned by the tree for the candidate.

        Yields:
            (anchor_id, anchor_polygon, anchor_mask, candidate_mask,
            intersected_ratio), where the ratio is the intersection area
            divided by the candidate area.
        """
        candidate_shapely_polygon = candidate_polygon.to_shapely_polygon()
        candidate_mask = candidate_polygon.mask

        # NOTE: The query results are treated as geometry objects and mapped back
        # to polygons by object identity, hence `id_to_anchor_polygon` must be keyed
        # by the `id()` of the geometries the tree was built from.
        for anchor_shapely_polygon in strtree.query(candidate_shapely_polygon):
            anchor_id = id(anchor_shapely_polygon)
            anchor_polygon = id_to_anchor_polygon[anchor_id]
            anchor_mask = anchor_polygon.mask

            intersected_ratio = calculate_boxed_masks_intersected_ratio(
                anchor_mask=anchor_mask,
                candidate_mask=candidate_mask,
                use_candidate_as_base=True,
            )

            yield (
                anchor_id,
                anchor_polygon,
                anchor_mask,
                candidate_mask,
                intersected_ratio,
            )

    def sample_page_non_text_region_polygons(
        self,
        page_non_text_region_polygons: Sequence[Polygon],
        num_page_text_region_infos: int,
        rng: RandomGenerator,
    ):
        """Sample non-text polygons without replacement.

        The number is chosen so that negatives make up about
        `negative_text_region_ratio` of all regions, capped by the number of
        available polygons.
        """
        negative_ratio = self.config.negative_text_region_ratio
        # Solves n / (n + num_positive) == negative_ratio for n.
        num_page_non_text_region_polygons = round(
            negative_ratio * num_page_text_region_infos / (1 - negative_ratio)
        )
        return rng_choice_with_size(
            rng,
            page_non_text_region_polygons,
            size=min(
                num_page_non_text_region_polygons,
                len(page_non_text_region_polygons),
            ),
            replace=False,
        )

    def _post_rotate_flattened_text_region(
        self,
        flattened_text_region: FlattenedTextRegion,
        rng: RandomGenerator,
    ):
        """Randomly post-rotate a region, or return it unchanged."""
        post_rotate_angle = 0
        if flattened_text_region.is_typical:
            if rng.random() < self.config.text_region_typical_post_rotate_prob:
                # Upside down only.
                post_rotate_angle = 180
        else:
            if rng.random() < self.config.text_region_untypical_post_rotate_prob:
                # 3-way rotate.
                post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25))

        if post_rotate_angle != 0:
            flattened_text_region = \
                flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle)

        return flattened_text_region

    def build_flattened_text_regions(
        self,
        page_image: Image,
        page_text_region_infos: Sequence[PageTextRegionInfo],
        page_non_text_region_polygons: Sequence[Polygon],
        rng: RandomGenerator,
    ):
        """Flatten, resize and randomly post-rotate positive and negative regions.

        Positive regions (those with char polygons) are scaled so that their
        char height median hits a sampled target. Negative regions are scaled
        to a height sampled from the resized positive regions and dropped when
        they become larger than any positive region.

        Returns:
            A tuple of flattened text regions, positives first.
        """
        # NOTE(review): at least one positive region is required, otherwise the
        # `max(...)` calls below raise ValueError -- confirm callers guarantee this.
        text_region_polygon_dilate_ratio = float(
            rng.uniform(
                self.config.text_region_flattener_text_region_polygon_dilate_ratio_min,
                self.config.text_region_flattener_text_region_polygon_dilate_ratio_max,
            )
        )
        typical_long_side_ratio_min = \
            self.config.text_region_flattener_typical_long_side_ratio_min

        text_region_polygons: List[Polygon] = []
        grouped_char_polygons: List[Sequence[Polygon]] = []
        for page_text_region_info in page_text_region_infos:
            text_region_polygons.append(page_text_region_info.precise_text_region_polygon)
            grouped_char_polygons.append(page_text_region_info.char_polygons)

        # Inject negative regions, marked by an empty group of char polygons.
        for page_non_text_region_polygon in page_non_text_region_polygons:
            text_region_polygons.append(page_non_text_region_polygon)
            grouped_char_polygons.append(tuple())

        text_region_flattener = TextRegionFlattener(
            typical_long_side_ratio_min=typical_long_side_ratio_min,
            text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
            image=page_image,
            text_region_polygons=text_region_polygons,
            grouped_char_polygons=grouped_char_polygons,
            is_training=True,
        )

        # Resize positive ftr.
        positive_flattened_text_regions: List[FlattenedTextRegion] = []
        # For negative sampling.
        positive_reference_heights: List[float] = []
        positive_reference_widths: List[float] = []
        num_negative_flattened_text_regions = 0

        for flattened_text_region in text_region_flattener.flattened_text_regions:
            if not flattened_text_region.flattened_char_polygons:
                num_negative_flattened_text_regions += 1
                continue

            char_height_median = flattened_text_region.get_char_height_meidan()

            text_region_resize_char_height_median = int(
                rng.integers(
                    self.config.text_region_resize_char_height_median_min,
                    self.config.text_region_resize_char_height_median_max + 1,
                )
            )
            scale = text_region_resize_char_height_median / char_height_median

            height, width = flattened_text_region.shape
            resized_height = round(height * scale)
            resized_width = round(width * scale)

            flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
                resized_height=resized_height,
                resized_width=resized_width,
            )

            positive_reference_heights.append(resized_height)
            positive_reference_widths.append(resized_width)

            # Post rotate.
            flattened_text_region = self._post_rotate_flattened_text_region(
                flattened_text_region,
                rng,
            )

            positive_flattened_text_regions.append(flattened_text_region)

        # Resize negative ftr.
        # Sample with replacement only if there are more negatives than references.
        negative_reference_heights = list(
            rng_choice_with_size(
                rng,
                positive_reference_heights,
                size=num_negative_flattened_text_regions,
                replace=(num_negative_flattened_text_regions > len(positive_reference_heights)),
            )
        )

        negative_height_max = max(positive_reference_heights)
        negative_width_max = max(positive_reference_widths)

        negative_flattened_text_regions: List[FlattenedTextRegion] = []

        for flattened_text_region in text_region_flattener.flattened_text_regions:
            if flattened_text_region.flattened_char_polygons:
                continue

            reference_height = negative_reference_heights.pop()
            scale = reference_height / flattened_text_region.height

            height, width = flattened_text_region.shape
            resized_height = round(height * scale)
            resized_width = round(width * scale)

            # Remove negative region that is too large.
            if resized_height > negative_height_max or resized_width > negative_width_max:
                continue

            flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
                resized_height=resized_height,
                resized_width=resized_width,
            )

            # Post rotate.
            flattened_text_region = self._post_rotate_flattened_text_region(
                flattened_text_region,
                rng,
            )

            negative_flattened_text_regions.append(flattened_text_region)

        flattened_text_regions = (
            *positive_flattened_text_regions,
            *negative_flattened_text_regions,
        )
        return flattened_text_regions
970 # https://github.com/shapely/shapely/issues/640 971 id_to_disconnected_text_region_polygon: Dict[int, Polygon] = {} 972 disconnected_text_region_shapely_polygons: List[ShapelyPolygon] = [] 973 974 for polygon in page_disconnected_text_region_collection.to_polygons(): 975 shapely_polygon = polygon.to_shapely_polygon() 976 id_to_disconnected_text_region_polygon[id(shapely_polygon)] = polygon 977 disconnected_text_region_shapely_polygons.append(shapely_polygon) 978 979 disconnected_text_region_tree = STRtree(disconnected_text_region_shapely_polygons) 980 981 # Get the precise text regions. 982 precise_text_region_candidate_polygons: List[Polygon] = [] 983 for resized_precise_polygon in page_resized_text_line_mask.to_disconnected_polygons(): 984 # Resize back to the shape after distortion. 985 precise_polygon = resized_precise_polygon.to_conducted_resized_polygon( 986 page_resized_text_line_mask, 987 resized_height=page_image.height, 988 resized_width=page_image.width, 989 ) 990 991 # Find and extract intersected text region. 992 # NOTE: One precise_polygon could be overlapped with 993 # more than one disconnected_text_region_polygon! 994 for _, _, disconnected_text_region_mask, precise_mask, _ in \ 995 self.strtree_query_intersected_polygons( 996 strtree=disconnected_text_region_tree, 997 id_to_anchor_polygon=id_to_disconnected_text_region_polygon, 998 candidate_polygon=precise_polygon, 999 ): 1000 precise_text_region_candidate_polygons.extend( 1001 self.generate_precise_text_region_candidate_polygons( 1002 precise_mask=precise_mask, 1003 disconnected_text_region_mask=disconnected_text_region_mask, 1004 ) 1005 ) 1006 1007 if debug: 1008 debug.page_image = page_image 1009 debug.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons 1010 1011 # Help gc. 
1012 del id_to_disconnected_text_region_polygon 1013 del disconnected_text_region_shapely_polygons 1014 del disconnected_text_region_tree 1015 1016 # Bind char-level polygon to precise text region. 1017 id_to_precise_text_region_polygon: Dict[int, Polygon] = {} 1018 precise_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1019 1020 for polygon in precise_text_region_candidate_polygons: 1021 shapely_polygon = polygon.to_shapely_polygon() 1022 id_to_precise_text_region_polygon[id(shapely_polygon)] = polygon 1023 precise_text_region_shapely_polygons.append(shapely_polygon) 1024 1025 precise_text_region_tree = STRtree(precise_text_region_shapely_polygons) 1026 1027 id_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list) 1028 for char_polygon in page_char_polygon_collection.polygons: 1029 best_precise_text_region_id = None 1030 intersected_ratio_max = 0 1031 1032 for ( 1033 precise_text_region_id, 1034 _, 1035 _, 1036 _, 1037 intersected_ratio, 1038 ) in self.strtree_query_intersected_polygons( 1039 strtree=precise_text_region_tree, 1040 id_to_anchor_polygon=id_to_precise_text_region_polygon, 1041 candidate_polygon=char_polygon, 1042 ): 1043 if intersected_ratio > intersected_ratio_max: 1044 intersected_ratio_max = intersected_ratio 1045 best_precise_text_region_id = precise_text_region_id 1046 1047 if best_precise_text_region_id is not None: 1048 id_to_char_polygons[best_precise_text_region_id].append(char_polygon) 1049 else: 1050 # NOTE: Text line with only a small char (i.e. delimiter) could enter this branch. 1051 # In such case, the text line bounding box is smaller than the char polygon, since 1052 # the leading/trailing char paddings are ignored during text line rendering. 1053 # It's acceptable for now since: 1) this case happens rarely, 2) and it won't 1054 # introduce labeling noise. 
1055 logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}') 1056 1057 page_text_region_infos: List[PageTextRegionInfo] = [] 1058 for precise_text_region_shapely_polygon in precise_text_region_shapely_polygons: 1059 ptrsp_id = id(precise_text_region_shapely_polygon) 1060 if ptrsp_id not in id_to_char_polygons: 1061 # Not related to any char polygons. 1062 continue 1063 assert id_to_char_polygons[ptrsp_id] 1064 page_text_region_infos.append( 1065 PageTextRegionInfo( 1066 precise_text_region_polygon=id_to_precise_text_region_polygon[ptrsp_id], 1067 char_polygons=id_to_char_polygons[ptrsp_id], 1068 ) 1069 ) 1070 1071 # Help gc. 1072 del id_to_precise_text_region_polygon 1073 del precise_text_region_shapely_polygons 1074 del precise_text_region_tree 1075 1076 if debug: 1077 debug.page_text_region_infos = page_text_region_infos 1078 1079 # Negative sampling. 1080 page_non_text_region_polygons = self.sample_page_non_text_region_polygons( 1081 page_non_text_region_polygons=tuple(page_non_text_region_collection.to_polygons()), 1082 num_page_text_region_infos=len(page_text_region_infos), 1083 rng=rng, 1084 ) 1085 1086 flattened_text_regions = self.build_flattened_text_regions( 1087 page_image=page_image, 1088 page_text_region_infos=page_text_region_infos, 1089 page_non_text_region_polygons=page_non_text_region_polygons, 1090 rng=rng, 1091 ) 1092 if debug: 1093 debug.flattened_text_regions = flattened_text_regions 1094 1095 # Stack text regions. 1096 image, _, char_polygons = stack_flattened_text_regions( 1097 page_pad=0, 1098 flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad, 1099 flattened_text_regions=flattened_text_regions, 1100 ) 1101 1102 # Post uniform rotation. 
1103 shape_before_rotate = image.shape 1104 rotate_angle = 0 1105 1106 if self.config.enable_post_rotate: 1107 rotate_angle = int( 1108 rng.integers( 1109 self.config.post_rotate_angle_min, 1110 self.config.post_rotate_angle_max + 1, 1111 ) 1112 ) 1113 rotated_result = rotate.distort( 1114 {'angle': rotate_angle}, 1115 image=image, 1116 polygons=char_polygons, 1117 ) 1118 assert rotated_result.image and rotated_result.polygons 1119 image = rotated_result.image 1120 char_polygons = rotated_result.polygons 1121 1122 return PageTextRegionStepOutput( 1123 page_image=image, 1124 page_char_polygons=char_polygons, 1125 shape_before_rotate=shape_before_rotate, 1126 rotate_angle=rotate_angle, 1127 debug=debug, 1128 ) 1129 1130 1131page_text_region_step_factory = PipelineStepFactory(PageTextRegionStep)
class PageTextRegionStepConfig:
    """Tunable parameters of the page text region pipeline step."""

    # Presumably forwarded to ``TextRegionFlattener(typical_long_side_ratio_min=...)``,
    # where regions whose long/short side ratio is below it are not "typical" -- TODO confirm wiring.
    text_region_flattener_typical_long_side_ratio_min: float = 3.0
    # Presumably the sampling bounds of the flattener's ``text_region_polygon_dilate_ratio``
    # -- TODO confirm, the consumer is not in this part of the file.
    text_region_flattener_text_region_polygon_dilate_ratio_min: float = 0.85
    text_region_flattener_text_region_polygon_dilate_ratio_max: float = 1.0
    # Presumably the target range of the median char height used when resizing positive
    # regions -- TODO confirm against ``build_flattened_text_regions``.
    text_region_resize_char_height_median_min: int = 30
    text_region_resize_char_height_median_max: int = 45
    # Probability that a typical flattened region is turned upside down (180 degrees).
    text_region_typical_post_rotate_prob: float = 0.2
    # Probability that a non-typical flattened region is rotated by 180/90/270 degrees
    # (picked with probabilities 0.5/0.25/0.25).
    text_region_untypical_post_rotate_prob: float = 0.2
    # Presumably the amount of negative (non-text) regions relative to the positive ones
    # -- TODO confirm against ``sample_page_non_text_region_polygons``.
    negative_text_region_ratio: float = 0.1
    # NOTE(review): the negative branch of ``build_flattened_text_regions`` reads the
    # typical/untypical probabilities above, not this field -- confirm whether that is intended.
    negative_text_region_post_rotate_prob: float = 0.2
    # Passed as ``flattened_text_regions_pad`` to ``stack_flattened_text_regions``.
    stack_flattened_text_regions_pad: int = 2
    # When true, ``run`` applies one extra rotation to the stacked image and char polygons.
    enable_post_rotate: bool = False
    # Inclusive bounds (degrees) of that rotation: ``rng.integers(min, max + 1)``.
    post_rotate_angle_min: int = -10
    post_rotate_angle_max: int = 10
    # When true, ``run`` fills a ``PageTextRegionStepDebug`` and returns it in the output.
    enable_debug: bool = False
# Constructor generated by attrs for PageTextRegionStepConfig: every parameter defaults
# to the default recorded on the matching attribute definition (``attr_dict[...]``)
# and is stored unchanged on the instance.
def __init__(self,
             text_region_flattener_typical_long_side_ratio_min=attr_dict['text_region_flattener_typical_long_side_ratio_min'].default,
             text_region_flattener_text_region_polygon_dilate_ratio_min=attr_dict['text_region_flattener_text_region_polygon_dilate_ratio_min'].default,
             text_region_flattener_text_region_polygon_dilate_ratio_max=attr_dict['text_region_flattener_text_region_polygon_dilate_ratio_max'].default,
             text_region_resize_char_height_median_min=attr_dict['text_region_resize_char_height_median_min'].default,
             text_region_resize_char_height_median_max=attr_dict['text_region_resize_char_height_median_max'].default,
             text_region_typical_post_rotate_prob=attr_dict['text_region_typical_post_rotate_prob'].default,
             text_region_untypical_post_rotate_prob=attr_dict['text_region_untypical_post_rotate_prob'].default,
             negative_text_region_ratio=attr_dict['negative_text_region_ratio'].default,
             negative_text_region_post_rotate_prob=attr_dict['negative_text_region_post_rotate_prob'].default,
             stack_flattened_text_regions_pad=attr_dict['stack_flattened_text_regions_pad'].default,
             enable_post_rotate=attr_dict['enable_post_rotate'].default,
             post_rotate_angle_min=attr_dict['post_rotate_angle_min'].default,
             post_rotate_angle_max=attr_dict['post_rotate_angle_max'].default,
             enable_debug=attr_dict['enable_debug'].default):
    # Plain field-by-field storage; no validation or conversion happens here.
    self.text_region_flattener_typical_long_side_ratio_min = text_region_flattener_typical_long_side_ratio_min
    self.text_region_flattener_text_region_polygon_dilate_ratio_min = text_region_flattener_text_region_polygon_dilate_ratio_min
    self.text_region_flattener_text_region_polygon_dilate_ratio_max = text_region_flattener_text_region_polygon_dilate_ratio_max
    self.text_region_resize_char_height_median_min = text_region_resize_char_height_median_min
    self.text_region_resize_char_height_median_max = text_region_resize_char_height_median_max
    self.text_region_typical_post_rotate_prob = text_region_typical_post_rotate_prob
    self.text_region_untypical_post_rotate_prob = text_region_untypical_post_rotate_prob
    self.negative_text_region_ratio = negative_text_region_ratio
    self.negative_text_region_post_rotate_prob = negative_text_region_post_rotate_prob
    self.stack_flattened_text_regions_pad = stack_flattened_text_regions_pad
    self.enable_post_rotate = enable_post_rotate
    self.post_rotate_angle_min = post_rotate_angle_min
    self.post_rotate_angle_max = post_rotate_angle_max
    self.enable_debug = enable_debug
Method generated by attrs for class PageTextRegionStepConfig.
class PageTextRegionStepInput:
    """Inputs consumed by the page text region step."""

    # ``run`` reads the page image, the char polygon collection, the disconnected
    # text region collection and the non-text region collection from this output.
    page_distortion_step_output: PageDistortionStepOutput
    # ``run`` reads ``page_text_line_mask`` (the resized text line mask) from this output.
    page_resizing_step_output: PageResizingStepOutput
# Constructor generated by attrs for PageTextRegionStepInput; both arguments are
# required and stored unchanged.
def __init__(self, page_distortion_step_output, page_resizing_step_output):
    self.page_distortion_step_output = page_distortion_step_output
    self.page_resizing_step_output = page_resizing_step_output
Method generated by attrs for class PageTextRegionStepInput.
class PageTextRegionInfo:
    """A precise text region together with the char polygons assigned to it."""

    # Polygon of the precise text region candidate.
    precise_text_region_polygon: Polygon
    # Char polygons bound to this region; ``run`` assigns each char polygon to the
    # region with the largest intersected ratio and only keeps regions with at least one char.
    char_polygons: Sequence[Polygon]
# Constructor generated by attrs for PageTextRegionInfo; both arguments are required
# and stored unchanged.
def __init__(self, precise_text_region_polygon, char_polygons):
    self.precise_text_region_polygon = precise_text_region_polygon
    self.char_polygons = char_polygons
Method generated by attrs for class PageTextRegionInfo.
class FlattenedTextRegion:
    """A text region that has been cut out of the page, rotated and trimmed.

    The ``flattened_*`` fields hold the current state of the patch; the other
    fields record how it was derived (rotation angles, shapes, trimming box).
    """

    # Whether the region was classified as "typical" by the flattener.
    is_typical: bool
    text_region_polygon: Polygon
    text_region_image: Image
    bounding_extended_text_region_mask: Mask
    # Angle used to rotate the region while flattening.
    flattening_rotate_angle: int
    shape_before_trim: Tuple[int, int]
    rotated_trimmed_box: Box
    shape_before_resize: Tuple[int, int]
    # 0 until ``to_post_rotated_flattened_text_region`` has been applied.
    post_rotate_angle: int
    flattened_image: Image
    flattened_mask: Mask
    # ``None`` when the region carries no char polygons.
    flattened_char_polygons: Optional[Sequence[Polygon]]

    @property
    def shape(self):
        """Shape of the flattened image."""
        image = self.flattened_image
        return image.shape

    @property
    def height(self):
        """Height of the flattened image."""
        image = self.flattened_image
        return image.height

    @property
    def width(self):
        """Width of the flattened image."""
        image = self.flattened_image
        return image.width

    @property
    def area(self):
        """Area of the flattened image."""
        image = self.flattened_image
        return image.area

    def get_char_height_meidan(self):
        """Return the median rectangular height of the flattened char polygons.

        Requires a non-empty ``flattened_char_polygons``.
        """
        polygons = self.flattened_char_polygons
        assert polygons
        heights = [polygon.get_rectangular_height() for polygon in polygons]
        return statistics.median(heights)

    def to_resized_flattened_text_region(
        self,
        resized_height: Optional[int] = None,
        resized_width: Optional[int] = None,
    ):
        """Return a copy whose image, mask and char polygons are resized.

        Only the three ``flattened_*`` fields are replaced; every other field
        is carried over unchanged.
        """
        target_size = {
            'resized_height': resized_height,
            'resized_width': resized_width,
        }

        image_after = self.flattened_image.to_resized_image(**target_size)
        mask_after = self.flattened_mask.to_resized_mask(**target_size)

        if self.flattened_char_polygons is None:
            polygons_after = None
        else:
            # Char polygons are rescaled relative to the shape before resizing.
            polygons_after = [
                polygon.to_conducted_resized_polygon(self.shape, **target_size)
                for polygon in self.flattened_char_polygons
            ]

        return attrs.evolve(
            self,
            flattened_image=image_after,
            flattened_mask=mask_after,
            flattened_char_polygons=polygons_after,
        )

    def to_post_rotated_flattened_text_region(
        self,
        post_rotate_angle: int,
    ):
        """Return a copy rotated by ``post_rotate_angle``.

        May only be applied once: the region must not be post-rotated yet.
        """
        assert self.post_rotate_angle == 0

        # NOTE: No need to trim.
        outcome = rotate.distort(
            {'angle': post_rotate_angle},
            image=self.flattened_image,
            mask=self.flattened_mask,
            polygons=self.flattened_char_polygons,
        )
        image_after = outcome.image
        assert image_after
        mask_after = outcome.mask
        assert mask_after

        return attrs.evolve(
            self,
            post_rotate_angle=post_rotate_angle,
            flattened_image=image_after,
            flattened_mask=mask_after,
            flattened_char_polygons=outcome.polygons,
        )
# Constructor generated by attrs for FlattenedTextRegion; all twelve arguments are
# required and stored unchanged.
def __init__(self, is_typical, text_region_polygon, text_region_image,
             bounding_extended_text_region_mask, flattening_rotate_angle, shape_before_trim,
             rotated_trimmed_box, shape_before_resize, post_rotate_angle, flattened_image,
             flattened_mask, flattened_char_polygons):
    self.is_typical = is_typical
    self.text_region_polygon = text_region_polygon
    self.text_region_image = text_region_image
    self.bounding_extended_text_region_mask = bounding_extended_text_region_mask
    self.flattening_rotate_angle = flattening_rotate_angle
    self.shape_before_trim = shape_before_trim
    self.rotated_trimmed_box = rotated_trimmed_box
    self.shape_before_resize = shape_before_resize
    self.post_rotate_angle = post_rotate_angle
    self.flattened_image = flattened_image
    self.flattened_mask = flattened_mask
    self.flattened_char_polygons = flattened_char_polygons
Method generated by attrs for class FlattenedTextRegion.
def to_resized_flattened_text_region(
    self,
    resized_height: Optional[int] = None,
    resized_width: Optional[int] = None,
):
    """Return a copy whose image, mask and char polygons are resized.

    Only the three ``flattened_*`` fields are replaced; every other field is
    carried over unchanged by ``attrs.evolve``.
    """
    target_size = {
        'resized_height': resized_height,
        'resized_width': resized_width,
    }

    image_after = self.flattened_image.to_resized_image(**target_size)
    mask_after = self.flattened_mask.to_resized_mask(**target_size)

    if self.flattened_char_polygons is None:
        polygons_after = None
    else:
        # Char polygons are rescaled relative to the shape before resizing.
        polygons_after = [
            polygon.to_conducted_resized_polygon(self.shape, **target_size)
            for polygon in self.flattened_char_polygons
        ]

    return attrs.evolve(
        self,
        flattened_image=image_after,
        flattened_mask=mask_after,
        flattened_char_polygons=polygons_after,
    )
def to_post_rotated_flattened_text_region(
    self,
    post_rotate_angle: int,
):
    """Return a copy rotated by ``post_rotate_angle``.

    May only be applied once: the region must not be post-rotated yet.
    The rotated image and mask must be truthy; the polygons are passed
    through as returned by the rotation.
    """
    assert self.post_rotate_angle == 0

    # NOTE: No need to trim.
    outcome = rotate.distort(
        {'angle': post_rotate_angle},
        image=self.flattened_image,
        mask=self.flattened_mask,
        polygons=self.flattened_char_polygons,
    )
    image_after = outcome.image
    assert image_after
    mask_after = outcome.mask
    assert mask_after

    return attrs.evolve(
        self,
        post_rotate_angle=post_rotate_angle,
        flattened_image=image_after,
        flattened_mask=mask_after,
        flattened_char_polygons=outcome.polygons,
    )
class PageTextRegionStepDebug:
    """Intermediate results collected by ``run`` when ``enable_debug`` is set.

    Every field defaults to ``None`` and is assigned as the step progresses.
    """

    # The distorted page image the step started from.
    page_image: Image = attrs.field(default=None)
    # All precise text region candidates, before chars are bound to them.
    precise_text_region_candidate_polygons: Sequence[Polygon] = attrs.field(default=None)
    # The regions that got at least one char polygon assigned.
    page_text_region_infos: Sequence[PageTextRegionInfo] = attrs.field(default=None)
    # The flattened regions (positive followed by negative) before stacking.
    flattened_text_regions: Sequence[FlattenedTextRegion] = attrs.field(default=None)
# Constructor generated by attrs for PageTextRegionStepDebug: every parameter defaults
# to the default recorded on the matching attribute definition (``attr_dict[...]``).
def __init__(self,
             page_image=attr_dict['page_image'].default,
             precise_text_region_candidate_polygons=attr_dict['precise_text_region_candidate_polygons'].default,
             page_text_region_infos=attr_dict['page_text_region_infos'].default,
             flattened_text_regions=attr_dict['flattened_text_regions'].default):
    self.page_image = page_image
    self.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons
    self.page_text_region_infos = page_text_region_infos
    self.flattened_text_regions = flattened_text_regions
Method generated by attrs for class PageTextRegionStepDebug.
class PageTextRegionStepOutput:
    """Result of the page text region step."""

    # Image built by stacking the flattened text regions, rotated when post rotation is enabled.
    page_image: Image
    # Char polygons in the coordinates of ``page_image``.
    page_char_polygons: Sequence[Polygon]
    # Shape of the stacked image before the optional post rotation.
    shape_before_rotate: Tuple[int, int]
    # Angle of the post rotation; 0 when post rotation is disabled.
    rotate_angle: int
    # Debug payload; ``None`` unless ``enable_debug`` is set.
    debug: Optional[PageTextRegionStepDebug]
# Constructor generated by attrs for PageTextRegionStepOutput; all arguments are
# required and stored unchanged.
def __init__(self, page_image, page_char_polygons, shape_before_rotate, rotate_angle, debug):
    self.page_image = page_image
    self.page_char_polygons = page_char_polygons
    self.shape_before_rotate = shape_before_rotate
    self.rotate_angle = rotate_angle
    self.debug = debug
Method generated by attrs for class PageTextRegionStepOutput.
def calculate_boxed_masks_intersected_ratio(
    anchor_mask: Mask,
    candidate_mask: Mask,
    use_candidate_as_base: bool = False,
):
    """Return how much two box-attached masks overlap, as a ratio.

    Args:
        anchor_mask: Mask with a ``box`` attached.
        candidate_mask: Mask with a ``box`` attached.
        use_candidate_as_base: When true, the intersected area is divided by
            the area of ``candidate_mask``; otherwise by the area of the union
            of both masks (intersection over union).

    Returns:
        The ratio as a float. ``0.0`` is returned when the boxes do not
        overlap, and also when the base area is zero (e.g. an empty candidate
        mask), instead of raising ``ZeroDivisionError``.
    """
    anchor_box = anchor_mask.box
    assert anchor_box

    candidate_box = candidate_mask.box
    assert candidate_box

    # Calculate intersection of the two boxes (inclusive coordinates).
    up = max(anchor_box.up, candidate_box.up)
    down = min(anchor_box.down, candidate_box.down)
    left = max(anchor_box.left, candidate_box.left)
    right = min(anchor_box.right, candidate_box.right)

    if up > down or left > right:
        return 0.0

    # Slice both matrices to the shared window, translating page coordinates
    # into each mask's own box-relative coordinates.
    np_intersected_anchor_mask = anchor_mask.mat[
        up - anchor_box.up:down - anchor_box.up + 1,
        left - anchor_box.left:right - anchor_box.left + 1,
    ]  # yapf: disable
    np_intersected_candidate_mask = candidate_mask.mat[
        up - candidate_box.up:down - candidate_box.up + 1,
        left - candidate_box.left:right - candidate_box.left + 1,
    ]  # yapf: disable
    np_intersected_mask = np_intersected_anchor_mask & np_intersected_candidate_mask
    intersected_area = int(np_intersected_mask.sum())

    if use_candidate_as_base:
        base_area = int(candidate_mask.np_mask.sum())
    else:
        base_area = (
            int(anchor_mask.np_mask.sum()) + int(candidate_mask.np_mask.sum()) - intersected_area
        )

    if base_area == 0:
        # Nothing is set in the base region, hence nothing can be intersected.
        return 0.0

    return intersected_area / base_area
class TextRegionFlattener:
    """Cut text regions out of an image and rotate them into flattened patches.

    The constructor runs the whole pipeline and exposes every intermediate
    result as an attribute; the final patches are in ``flattened_text_regions``.
    """

    @classmethod
    def patch_text_region_polygons(
        cls,
        text_region_polygons: Sequence[Polygon],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
    ):
        """Enlarge every text region polygon so that it covers its char polygons.

        Returns ``text_region_polygons`` unchanged when ``grouped_char_polygons``
        is ``None``. Otherwise both sequences must have the same length, and a
        new list with one patched polygon per region is returned.
        """
        if grouped_char_polygons is None:
            return text_region_polygons

        assert len(text_region_polygons) == len(grouped_char_polygons)

        patched_text_region_polygons: List[Polygon] = []
        for text_region_polygon, char_polygons in zip(text_region_polygons, grouped_char_polygons):
            # Need to make sure all char polygons are included.
            unionized_polygons = [text_region_polygon]
            unionized_polygons.extend(char_polygons)

            # Rasterize the union into a mask spanning all polygons, then take its outline.
            bounding_box = Box.from_boxes((polygon.bounding_box for polygon in unionized_polygons))
            mask = Mask.from_shapable(bounding_box).to_box_attached(bounding_box)
            for polygon in unionized_polygons:
                polygon.fill_mask(mask)

            patched_text_region_polygons.append(mask.to_external_polygon())

        return patched_text_region_polygons

    @classmethod
    def process_text_region_polygons(
        cls,
        text_region_polygon_dilate_ratio: float,
        shape: Tuple[int, int],
        text_region_polygons: Sequence[Polygon],
        force_no_dilation_flags: Optional[Sequence[bool]] = None,
    ):
        """Build, per region, the extended mask and the bounding rectangular polygon.

        Each region is dilated (unless its flag in ``force_no_dilation_flags``
        is true) and clipped to ``shape``. The extended mask of a region covers
        the potentially dilated region minus the other text regions, plus the
        non-text area inside the bounding rectangular polygon.

        Returns:
            ``(bounding_extended_text_region_masks, bounding_rectangular_polygons)``,
            two lists aligned with ``text_region_polygons``.
        """
        text_mask = Mask.from_polygons(shape, text_region_polygons)
        non_text_mask = text_mask.to_inverted_mask()

        box = Box.from_shape(shape)
        text_mask = text_mask.to_box_attached(box)
        non_text_mask = non_text_mask.to_box_attached(box)

        bounding_extended_text_region_masks: List[Mask] = []
        bounding_rectangular_polygons: List[Polygon] = []

        if force_no_dilation_flags is None:
            # Dilate every region.
            force_no_dilation_flags_iter = itertools.repeat(False)
        else:
            assert len(force_no_dilation_flags) == len(text_region_polygons)
            force_no_dilation_flags_iter = force_no_dilation_flags

        for text_region_polygon, force_no_dilation_flag in zip(
            text_region_polygons, force_no_dilation_flags_iter
        ):
            original_text_region_polygon = text_region_polygon

            if not force_no_dilation_flag:
                # Dilate.
                text_region_polygon = text_region_polygon.to_dilated_polygon(
                    ratio=text_region_polygon_dilate_ratio,
                )
                text_region_polygon = text_region_polygon.to_clipped_polygon(shape)

            # Get bounding rectangular box (polygon).
            bounding_rectangular_polygon = \
                text_region_polygon.to_bounding_rectangular_polygon(shape)

            bounding_box = bounding_rectangular_polygon.bounding_box

            # Get other text region.
            bounding_other_text_mask = bounding_rectangular_polygon.extract_mask(text_mask).copy()
            # NOTE: Use the original text region polygon to unset the current text mask.
            original_text_region_polygon.fill_mask(bounding_other_text_mask, 0)

            # Get potentially dilated text region.
            bounding_text_mask = Mask.from_shapable(bounding_other_text_mask)
            bounding_text_mask = bounding_text_mask.to_box_attached(bounding_box)
            # NOTE: Use the potentially dilated text region polygon to set the current text mask.
            text_region_polygon.fill_mask(bounding_text_mask, value=1)

            # Should not use the potentially dilated text region polygon anymore.
            del text_region_polygon

            # Trim potentially dilated text region polygon by eliminating other text region.
            bounding_trimmed_text_mask = Mask.from_masks(
                bounding_box,
                [
                    # Includes the potentially dilated text region.
                    bounding_text_mask,
                    # But not includes any other text regions.
                    bounding_other_text_mask.to_inverted_mask(),
                ],
                ElementSetOperationMode.INTERSECT,
            )

            # Get non-text region.
            bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask)

            # Combine trimmed text region and non-text region.
            bounding_extended_text_region_mask = Mask.from_masks(
                bounding_box,
                [bounding_trimmed_text_mask, bounding_non_text_mask],
            )

            bounding_extended_text_region_masks.append(bounding_extended_text_region_mask)
            bounding_rectangular_polygons.append(bounding_rectangular_polygon)

        return bounding_extended_text_region_masks, bounding_rectangular_polygons

    @classmethod
    def analyze_bounding_rectangular_polygons(
        cls,
        bounding_rectangular_polygons: Sequence[Polygon],
    ):
        """Measure the side ratio and the long side direction of every rectangle.

        Every polygon must have exactly four points.

        Returns:
            ``(long_side_ratios, long_side_angles)``: the long/short side length
            ratio and the integer angle of the long side in [0, 180) degrees.
        """
        long_side_ratios: List[float] = []
        long_side_angles: List[int] = []

        for polygon in bounding_rectangular_polygons:
            # Get reference line.
            point0, point1, _, point3 = polygon.points
            side0_length = math.hypot(
                point0.smooth_y - point1.smooth_y,
                point0.smooth_x - point1.smooth_x,
            )
            side1_length = math.hypot(
                point0.smooth_y - point3.smooth_y,
                point0.smooth_x - point3.smooth_x,
            )

            # NOTE(review): a degenerate rectangle with a zero-length side divides by zero here.
            long_side_ratios.append(
                max(side0_length, side1_length) / min(side0_length, side1_length)
            )

            point_a = point0
            if side0_length > side1_length:
                # Reference line (p0 -> p1).
                point_b = point1
            else:
                # Reference line (p0 -> p3).
                point_b = point3

            # Get the angle of reference line, in [0, 180) degree.
            np_theta = np.arctan2(
                point_a.smooth_y - point_b.smooth_y,
                point_a.smooth_x - point_b.smooth_x,
            )
            np_theta = np_theta % np.pi
            long_side_angle = round(np_theta / np.pi * 180) % 180
            long_side_angles.append(long_side_angle)

        return long_side_ratios, long_side_angles

    @classmethod
    def get_typical_angle(
        cls,
        typical_long_side_ratio_min: float,
        long_side_ratios: Sequence[float],
        long_side_angles: Sequence[int],
    ):
        """Pick the typical (elongated) regions and average their direction.

        A region is typical when its long side ratio is at least
        ``typical_long_side_ratio_min``.

        Returns:
            ``(typical_angle, typical_indices)``. ``typical_angle`` is the
            circular mean of the typical long side angles as an integer in
            [0, 180), or ``None`` when no region is typical.
        """
        typical_indices: Set[int] = set()
        typical_long_side_angles: List[float] = []

        for idx, (long_side_ratio, long_side_angle) in \
                enumerate(zip(long_side_ratios, long_side_angles)):
            if long_side_ratio < typical_long_side_ratio_min:
                continue

            typical_indices.add(idx)
            typical_long_side_angles.append(long_side_angle)

        if not typical_long_side_angles:
            return None, typical_indices

        # NOTE: Due to the sudden change between 179 and 0 degree,
        # we need to normalize the range to [0, 360) before calculate the mean of angles.
        two_pi = 2 * np.pi
        np_angles = np.asarray(typical_long_side_angles) / 180 * two_pi
        np_sin_mean = np.sin(np_angles).mean()
        np_cos_mean = np.cos(np_angles).mean()

        np_theta = np.arctan2(np_sin_mean, np_cos_mean)
        np_theta = np_theta % two_pi
        # Rescale the range back to [0, 180). Rounding can land exactly on 180
        # (e.g. a mean of 179.7 degrees), which is the same direction as 0.
        typical_angle = round(np_theta / two_pi * 180) % 180

        return typical_angle, typical_indices

    @classmethod
    def get_flattening_rotate_angles(
        cls,
        typical_angle: Optional[int],
        typical_indices: Set[int],
        long_side_angles: Sequence[int],
    ):
        """Compute the rotation angle that flattens each region.

        Typical regions (and all regions when there is no typical angle) follow
        their own long side. The remaining regions follow whichever of their
        long or short side is closer to ``typical_angle``.

        Returns:
            One integer angle per entry of ``long_side_angles``.
        """
        if typical_angle is not None:
            assert typical_indices

        flattening_rotate_angles: List[int] = []

        for idx, long_side_angle in enumerate(long_side_angles):
            if typical_angle is None or idx in typical_indices:
                # Dominated by long_side_angle.
                main_angle = long_side_angle

            else:
                # Dominated by typical_angle.
                short_side_angle = (long_side_angle + 90) % 180
                # Smallest distance between two undirected lines, in [0, 90].
                long_side_delta = abs((long_side_angle - typical_angle + 90) % 180 - 90)
                short_side_delta = abs((short_side_angle - typical_angle + 90) % 180 - 90)

                if long_side_delta < short_side_delta:
                    main_angle = long_side_angle
                else:
                    main_angle = short_side_angle

            # Angle for flattening.
            if main_angle <= 90:
                # 0, or [270, 360).
                flattening_rotate_angle = (360 - main_angle) % 360
            else:
                # [1, 90).
                flattening_rotate_angle = 180 - main_angle
            flattening_rotate_angles.append(flattening_rotate_angle)

        return flattening_rotate_angles

    @classmethod
    def build_flattened_text_regions(
        cls,
        image: Image,
        text_region_polygons: Sequence[Polygon],
        bounding_extended_text_region_masks: Sequence[Mask],
        typical_indices: Set[int],
        flattening_rotate_angles: Sequence[int],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
    ):
        """Extract, rotate and trim every region into a ``FlattenedTextRegion``.

        The first three sequences and ``flattening_rotate_angles`` are iterated
        in lockstep. When given, ``grouped_char_polygons`` is indexed by the
        same position.
        """
        flattened_text_regions: List[FlattenedTextRegion] = []

        for idx, (
            text_region_polygon,
            bounding_extended_text_region_mask,
            flattening_rotate_angle,
        ) in enumerate(
            zip(
                text_region_polygons,
                bounding_extended_text_region_masks,
                flattening_rotate_angles,
            )
        ):
            bounding_box = bounding_extended_text_region_mask.box
            assert bounding_box

            # Extract image.
            text_region_image = bounding_extended_text_region_mask.extract_image(image)

            # Shift char polygons.
            relative_char_polygons = None
            if grouped_char_polygons is not None:
                char_polygons = grouped_char_polygons[idx]
                relative_char_polygons = [
                    char_polygon.to_relative_polygon(
                        origin_y=bounding_box.up,
                        origin_x=bounding_box.left,
                    ) for char_polygon in char_polygons
                ]

            # Rotate.
            rotated_result = rotate.distort(
                {'angle': flattening_rotate_angle},
                image=text_region_image,
                mask=bounding_extended_text_region_mask,
                polygons=relative_char_polygons,
            )
            rotated_text_region_image = rotated_result.image
            assert rotated_text_region_image
            rotated_bounding_extended_text_region_mask = rotated_result.mask
            assert rotated_bounding_extended_text_region_mask
            # Could be None.
            rotated_char_polygons = rotated_result.polygons

            # Trim.
            rotated_trimmed_box = rotated_bounding_extended_text_region_mask.to_external_box()

            trimmed_text_region_image = rotated_text_region_image.to_cropped_image(
                up=rotated_trimmed_box.up,
                down=rotated_trimmed_box.down,
                left=rotated_trimmed_box.left,
                right=rotated_trimmed_box.right,
            )

            trimmed_mask = rotated_trimmed_box.extract_mask(
                rotated_bounding_extended_text_region_mask
            )

            # Stays None for regions without any char polygon.
            trimmed_char_polygons = None
            if rotated_char_polygons:
                trimmed_char_polygons = [
                    rotated_char_polygon.to_relative_polygon(
                        origin_y=rotated_trimmed_box.up,
                        origin_x=rotated_trimmed_box.left,
                    ) for rotated_char_polygon in rotated_char_polygons
                ]

            flattened_text_regions.append(
                FlattenedTextRegion(
                    is_typical=(idx in typical_indices),
                    text_region_polygon=text_region_polygon,
                    text_region_image=bounding_extended_text_region_mask.extract_image(image),
                    bounding_extended_text_region_mask=bounding_extended_text_region_mask,
                    flattening_rotate_angle=flattening_rotate_angle,
                    shape_before_trim=rotated_text_region_image.shape,
                    rotated_trimmed_box=rotated_trimmed_box,
                    shape_before_resize=trimmed_text_region_image.shape,
                    post_rotate_angle=0,
                    flattened_image=trimmed_text_region_image,
                    flattened_mask=trimmed_mask,
                    flattened_char_polygons=trimmed_char_polygons,
                )
            )

        return flattened_text_regions

    def __init__(
        self,
        typical_long_side_ratio_min: float,
        text_region_polygon_dilate_ratio: float,
        image: Image,
        text_region_polygons: Sequence[Polygon],
        grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None,
        is_training: bool = False,
    ):
        """Run the flattening pipeline and keep every intermediate result.

        Args:
            typical_long_side_ratio_min: Minimum long/short side ratio of a
                typical region.
            text_region_polygon_dilate_ratio: Ratio used to dilate the regions.
            image: Image the regions are extracted from.
            text_region_polygons: Regions to flatten.
            grouped_char_polygons: Optional char polygons, one group per region.
            is_training: When true, ``grouped_char_polygons`` is required and
                regions with an empty char polygon group are not dilated.
        """
        self.origional_text_region_polygons = text_region_polygons

        self.text_region_polygons = self.patch_text_region_polygons(
            text_region_polygons=text_region_polygons,
            grouped_char_polygons=grouped_char_polygons,
        )

        force_no_dilation_flags = None
        if is_training:
            assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons)
            force_no_dilation_flags = []
            for char_polygons in grouped_char_polygons:
                force_no_dilation_flags.append(not char_polygons)

        self.bounding_extended_text_region_masks, self.bounding_rectangular_polygons = \
            self.process_text_region_polygons(
                text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
                shape=image.shape,
                text_region_polygons=self.text_region_polygons,
                force_no_dilation_flags=force_no_dilation_flags,
            )

        self.long_side_ratios, self.long_side_angles = \
            self.analyze_bounding_rectangular_polygons(self.bounding_rectangular_polygons)

        self.typical_angle, self.typical_indices = self.get_typical_angle(
            typical_long_side_ratio_min=typical_long_side_ratio_min,
            long_side_ratios=self.long_side_ratios,
            long_side_angles=self.long_side_angles,
        )

        self.flattening_rotate_angles = self.get_flattening_rotate_angles(
            typical_angle=self.typical_angle,
            typical_indices=self.typical_indices,
            long_side_angles=self.long_side_angles,
        )

        # NOTE: The flattened regions keep a reference to the unpatched polygons.
        self.flattened_text_regions = self.build_flattened_text_regions(
            image=image,
            text_region_polygons=self.origional_text_region_polygons,
            bounding_extended_text_region_masks=self.bounding_extended_text_region_masks,
            typical_indices=self.typical_indices,
            flattening_rotate_angles=self.flattening_rotate_angles,
            grouped_char_polygons=grouped_char_polygons,
        )
def __init__(
    self,
    typical_long_side_ratio_min: float,
    text_region_polygon_dilate_ratio: float,
    image: Image,
    text_region_polygons: Sequence[Polygon],
    grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None,
    is_training: bool = False,
):
    """Run the flattening pipeline and keep every intermediate result.

    In training mode ``grouped_char_polygons`` is required, must match
    ``text_region_polygons`` in length, and regions whose char polygon group
    is empty are excluded from dilation.
    """
    self.origional_text_region_polygons = text_region_polygons

    # Step 1: make sure each region covers its char polygons.
    self.text_region_polygons = self.patch_text_region_polygons(
        text_region_polygons=text_region_polygons,
        grouped_char_polygons=grouped_char_polygons,
    )

    # Step 2: build the extended masks and the bounding rectangles.
    no_dilation_flags = None
    if is_training:
        assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons)
        no_dilation_flags = [not group for group in grouped_char_polygons]

    extended_masks, rectangular_polygons = self.process_text_region_polygons(
        text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
        shape=image.shape,
        text_region_polygons=self.text_region_polygons,
        force_no_dilation_flags=no_dilation_flags,
    )
    self.bounding_extended_text_region_masks = extended_masks
    self.bounding_rectangular_polygons = rectangular_polygons

    # Step 3: measure the rectangles and derive the rotation angles.
    ratios, angles = self.analyze_bounding_rectangular_polygons(
        self.bounding_rectangular_polygons
    )
    self.long_side_ratios = ratios
    self.long_side_angles = angles

    typical_angle, typical_indices = self.get_typical_angle(
        typical_long_side_ratio_min=typical_long_side_ratio_min,
        long_side_ratios=self.long_side_ratios,
        long_side_angles=self.long_side_angles,
    )
    self.typical_angle = typical_angle
    self.typical_indices = typical_indices

    self.flattening_rotate_angles = self.get_flattening_rotate_angles(
        typical_angle=self.typical_angle,
        typical_indices=self.typical_indices,
        long_side_angles=self.long_side_angles,
    )

    # Step 4: extract, rotate and trim. The unpatched polygons are recorded on the results.
    self.flattened_text_regions = self.build_flattened_text_regions(
        image=image,
        text_region_polygons=self.origional_text_region_polygons,
        bounding_extended_text_region_masks=self.bounding_extended_text_region_masks,
        typical_indices=self.typical_indices,
        flattening_rotate_angles=self.flattening_rotate_angles,
        grouped_char_polygons=grouped_char_polygons,
    )
@classmethod
def patch_text_region_polygons(
    cls,
    text_region_polygons: Sequence[Polygon],
    grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
):
    """Grow each text region polygon so that it covers all of its char polygons.

    Returns `text_region_polygons` unchanged when `grouped_char_polygons` is None.
    Otherwise returns a new list holding, for every region, the external polygon of a mask
    filled with the region polygon and its char polygons.
    """
    if grouped_char_polygons is None:
        return text_region_polygons

    assert len(text_region_polygons) == len(grouped_char_polygons)

    def build_patched_polygon(region_polygon, region_char_polygons):
        # The region polygon comes first, followed by every char polygon it should include.
        members = [region_polygon, *region_char_polygons]

        # Rasterize all members into one mask spanning their joint bounding box.
        union_box = Box.from_boxes((member.bounding_box for member in members))
        union_mask = Mask.from_shapable(union_box).to_box_attached(union_box)
        for member in members:
            member.fill_mask(union_mask)

        return union_mask.to_external_polygon()

    return [
        build_patched_polygon(region_polygon, region_char_polygons)
        for region_polygon, region_char_polygons in zip(
            text_region_polygons,
            grouped_char_polygons,
        )
    ]
@classmethod
def process_text_region_polygons(
    cls,
    text_region_polygon_dilate_ratio: float,
    shape: Tuple[int, int],
    text_region_polygons: Sequence[Polygon],
    force_no_dilation_flags: Optional[Sequence[bool]] = None,
):
    """Build, for every text region, an extended mask and its bounding rectangular polygon.

    Args:
        text_region_polygon_dilate_ratio: Ratio passed to `Polygon.to_dilated_polygon`.
        shape: Shape of the page; used to build the page-level masks and to clip polygons.
        text_region_polygons: One polygon per text region.
        force_no_dilation_flags: Optional per-region flags, aligned with
            `text_region_polygons`. A true flag skips the dilation of that region.
            When None, every region is dilated.

    Returns:
        A pair `(bounding_extended_text_region_masks, bounding_rectangular_polygons)`,
        both aligned with `text_region_polygons`.
    """
    # Page-level masks: all text regions, and everything else.
    text_mask = Mask.from_polygons(shape, text_region_polygons)
    non_text_mask = text_mask.to_inverted_mask()

    box = Box.from_shape(shape)
    text_mask = text_mask.to_box_attached(box)
    non_text_mask = non_text_mask.to_box_attached(box)

    bounding_extended_text_region_masks: List[Mask] = []
    bounding_rectangular_polygons: List[Polygon] = []

    if force_no_dilation_flags is None:
        # No flags given: zip() below pairs every region with False.
        force_no_dilation_flags_iter = itertools.repeat(False)
    else:
        assert len(force_no_dilation_flags) == len(text_region_polygons)
        force_no_dilation_flags_iter = force_no_dilation_flags

    for text_region_polygon, force_no_dilation_flag in zip(
        text_region_polygons, force_no_dilation_flags_iter
    ):
        # Keep the undilated polygon; it is needed to unset the current region below.
        original_text_region_polygon = text_region_polygon

        if not force_no_dilation_flag:
            # Dilate.
            text_region_polygon = text_region_polygon.to_dilated_polygon(
                ratio=text_region_polygon_dilate_ratio,
            )
            text_region_polygon = text_region_polygon.to_clipped_polygon(shape)

        # Get bounding rectangular box (polygon).
        bounding_rectangular_polygon = \
            text_region_polygon.to_bounding_rectangular_polygon(shape)

        bounding_box = bounding_rectangular_polygon.bounding_box

        # Get the other text regions. Work on a copy since the mask is modified next.
        bounding_other_text_mask = bounding_rectangular_polygon.extract_mask(text_mask).copy()
        # NOTE: Use the original text region polygon to unset the current text mask.
        original_text_region_polygon.fill_mask(bounding_other_text_mask, 0)

        # Get the potentially dilated text region.
        bounding_text_mask = Mask.from_shapable(bounding_other_text_mask)
        bounding_text_mask = bounding_text_mask.to_box_attached(bounding_box)
        # NOTE: Use the potentially dilated text region polygon to set the current text mask.
        text_region_polygon.fill_mask(bounding_text_mask, value=1)

        # Should not use the potentially dilated text region polygon anymore.
        del text_region_polygon

        # Trim the potentially dilated text region by eliminating the other text regions.
        bounding_trimmed_text_mask = Mask.from_masks(
            bounding_box,
            [
                # Includes the potentially dilated text region.
                bounding_text_mask,
                # But does not include any other text region.
                bounding_other_text_mask.to_inverted_mask(),
            ],
            ElementSetOperationMode.INTERSECT,
        )

        # Get non-text region.
        bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask)

        # Combine trimmed text region and non-text region.
        # NOTE(review): no mode is passed here, so the default mode of Mask.from_masks
        # applies -- presumably a union; confirm against Mask.from_masks.
        bounding_extended_text_region_mask = Mask.from_masks(
            bounding_box,
            [bounding_trimmed_text_mask, bounding_non_text_mask],
        )

        bounding_extended_text_region_masks.append(bounding_extended_text_region_mask)
        bounding_rectangular_polygons.append(bounding_rectangular_polygon)

    return bounding_extended_text_region_masks, bounding_rectangular_polygons
@classmethod
def analyze_bounding_rectangular_polygons(
    cls,
    bounding_rectangular_polygons: Sequence[Polygon],
):
    """Measure the elongation and the orientation of every bounding rectangular polygon.

    Each polygon must expose exactly four points. The two sides starting at the first point
    (towards the second and towards the fourth point) are compared.

    Returns:
        A pair `(long_side_ratios, long_side_angles)`: the long/short side length ratio and
        the angle of the long side as an integer degree in [0, 180), one entry per polygon.
    """
    ratios: List[float] = []
    angles: List[int] = []

    for rectangular_polygon in bounding_rectangular_polygons:
        corner0, corner1, _, corner3 = rectangular_polygon.points

        length_01 = math.hypot(
            corner0.smooth_y - corner1.smooth_y,
            corner0.smooth_x - corner1.smooth_x,
        )
        length_03 = math.hypot(
            corner0.smooth_y - corner3.smooth_y,
            corner0.smooth_x - corner3.smooth_x,
        )

        # The reference line starts at corner0 and follows the longer side; ties go to
        # the (corner0 -> corner3) side.
        if length_01 > length_03:
            long_length, short_length, far_corner = length_01, length_03, corner1
        else:
            long_length, short_length, far_corner = length_03, length_01, corner3

        ratios.append(long_length / short_length)

        # Direction of the reference line folded into [0, pi), then converted to degree.
        np_radian = np.arctan2(
            corner0.smooth_y - far_corner.smooth_y,
            corner0.smooth_x - far_corner.smooth_x,
        ) % np.pi
        # The trailing modulo maps a value rounded up to 180 back to 0.
        angles.append(round(np_radian / np.pi * 180) % 180)

    return ratios, angles
@classmethod
def get_typical_angle(
    cls,
    typical_long_side_ratio_min: float,
    long_side_ratios: Sequence[float],
    long_side_angles: Sequence[int],
):
    """Estimate the dominating long side angle from the elongated ("typical") regions.

    Args:
        typical_long_side_ratio_min: A region is typical unless its long side ratio is
            smaller than this value.
        long_side_ratios: Long/short side ratio of every region.
        long_side_angles: Long side angle of every region, as a degree in [0, 180).

    Returns:
        A pair `(typical_angle, typical_indices)`. `typical_indices` holds the indices of
        the typical regions. `typical_angle` is the circular mean of their angles as an
        integer degree in [0, 180), or None when there is no typical region.
    """
    typical_indices: Set[int] = set()
    typical_long_side_angles: List[float] = []

    for idx, (long_side_ratio, long_side_angle) in \
        enumerate(zip(long_side_ratios, long_side_angles)):
        if long_side_ratio < typical_long_side_ratio_min:
            continue

        typical_indices.add(idx)
        typical_long_side_angles.append(long_side_angle)

    if not typical_long_side_angles:
        return None, typical_indices

    # NOTE: Due to the sudden change between 179 and 0 degree,
    # we need to normalize the range to [0, 360) before calculate the mean of angles.
    two_pi = 2 * np.pi
    np_angles = np.asarray(typical_long_side_angles) / 180 * two_pi
    np_sin_mean = np.sin(np_angles).mean()
    np_cos_mean = np.cos(np_angles).mean()

    np_theta = np.arctan2(np_sin_mean, np_cos_mean)
    np_theta = np_theta % two_pi
    # Rescale the range back to [0, 180).
    # NOTE: Rounding could produce exactly 180 (i.e. mean slightly below 180 degree), which
    # is the same direction as 0. The trailing modulo keeps the result inside [0, 180).
    typical_angle = int(round(np_theta / two_pi * 180)) % 180

    return typical_angle, typical_indices
@classmethod
def get_flattening_rotate_angles(
    cls,
    typical_angle: Optional[int],
    typical_indices: Set[int],
    long_side_angles: Sequence[int],
):
    """Pick, for every region, the rotation angle that lays its main axis flat.

    Typical regions (and all regions when `typical_angle` is None) use their long side as
    the main axis. Any other region uses whichever of its long/short side is strictly
    closer to `typical_angle`, the short side winning ties.

    Returns:
        One rotation angle (integer degree) per entry of `long_side_angles`.
    """
    if typical_angle is not None:
        assert typical_indices

    def distance_to_typical(angle):
        # Distance between two undirected lines, in [0, 90] degree.
        return abs((angle - typical_angle + 90) % 180 - 90)

    def select_main_angle(region_idx, long_side_angle):
        if typical_angle is None or region_idx in typical_indices:
            # Dominated by the long side itself.
            return long_side_angle

        # Dominated by the typical angle.
        short_side_angle = (long_side_angle + 90) % 180
        if distance_to_typical(long_side_angle) < distance_to_typical(short_side_angle):
            return long_side_angle
        return short_side_angle

    def to_rotate_angle(main_angle):
        if main_angle > 90:
            # Main angle in (90, 180) maps to (0, 90).
            return 180 - main_angle
        # Main angle in [0, 90] maps to 0 or [270, 360).
        return (360 - main_angle) % 360

    return [
        to_rotate_angle(select_main_angle(region_idx, long_side_angle))
        for region_idx, long_side_angle in enumerate(long_side_angles)
    ]
@classmethod
def build_flattened_text_regions(
    cls,
    image: Image,
    text_region_polygons: Sequence[Polygon],
    bounding_extended_text_region_masks: Sequence[Mask],
    typical_indices: Set[int],
    flattening_rotate_angles: Sequence[int],
    grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]],
):
    """Extract, rotate and trim every text region into a `FlattenedTextRegion`.

    The three per-region sequences are consumed in lockstep. `grouped_char_polygons`, when
    given, is indexed by the region position and its polygons follow the same
    shift / rotate / trim transformations as the image.

    Returns:
        A list with one `FlattenedTextRegion` per region, with `post_rotate_angle` set to 0.
    """
    results: List[FlattenedTextRegion] = []

    regions = zip(
        text_region_polygons,
        bounding_extended_text_region_masks,
        flattening_rotate_angles,
    )
    for region_idx, (region_polygon, extended_mask, rotate_angle) in enumerate(regions):
        extended_box = extended_mask.box
        assert extended_box

        # Crop the region out of the page.
        cropped_image = extended_mask.extract_image(image)

        # Express char polygons relative to the cropped region.
        shifted_char_polygons = None
        if grouped_char_polygons is not None:
            shifted_char_polygons = [
                char_polygon.to_relative_polygon(
                    origin_y=extended_box.up,
                    origin_x=extended_box.left,
                ) for char_polygon in grouped_char_polygons[region_idx]
            ]

        # Lay the region flat.
        rotated = rotate.distort(
            {'angle': rotate_angle},
            image=cropped_image,
            mask=extended_mask,
            polygons=shifted_char_polygons,
        )
        rotated_image = rotated.image
        assert rotated_image
        rotated_mask = rotated.mask
        assert rotated_mask
        # None when no char polygons were passed in.
        rotated_char_polygons = rotated.polygons

        # Trim to the external box of the rotated mask.
        trim_box = rotated_mask.to_external_box()
        flattened_image = rotated_image.to_cropped_image(
            up=trim_box.up,
            down=trim_box.down,
            left=trim_box.left,
            right=trim_box.right,
        )
        flattened_mask = trim_box.extract_mask(rotated_mask)

        flattened_char_polygons = None
        if rotated_char_polygons:
            flattened_char_polygons = [
                rotated_char_polygon.to_relative_polygon(
                    origin_y=trim_box.up,
                    origin_x=trim_box.left,
                ) for rotated_char_polygon in rotated_char_polygons
            ]

        flattened_text_region = FlattenedTextRegion(
            is_typical=(region_idx in typical_indices),
            text_region_polygon=region_polygon,
            text_region_image=extended_mask.extract_image(image),
            bounding_extended_text_region_mask=extended_mask,
            flattening_rotate_angle=rotate_angle,
            shape_before_trim=rotated_image.shape,
            rotated_trimmed_box=trim_box,
            shape_before_resize=flattened_image.shape,
            post_rotate_angle=0,
            flattened_image=flattened_image,
            flattened_mask=flattened_mask,
            flattened_char_polygons=flattened_char_polygons,
        )
        results.append(flattened_text_region)

    return results
def build_background_image_for_stacking(height: int, width: int):
    """Build a `height` x `width` RGB image filled with a repeating red/green/blue pattern.

    The pixel at (row, column) is pure red, green or blue depending on
    `(row + column) % 3`, so equal colors line up along the anti-diagonals.
    """
    # Allocate first, so that invalid sizes fail before any index is computed.
    np_image = np.zeros((height, width, 3), dtype=np.uint8)

    np_palette = np.array([(255, 0, 0), (0, 255, 0), (0, 0, 255)], dtype=np.uint8)
    np_row_indices = np.arange(height).reshape(-1, 1)
    np_col_indices = np.arange(width).reshape(1, -1)

    # Palette index of every pixel, then a single lookup paints the whole image.
    np_color_indices = (np_row_indices + np_col_indices) % 3
    np_image[:] = np_palette[np_color_indices]

    return Image(mat=np_image)
def stack_flattened_text_regions(
    page_pad: int,
    flattened_text_regions_pad: int,
    flattened_text_regions: Sequence[FlattenedTextRegion],
):
    """Pack all flattened text regions into a single image.

    Args:
        page_pad: Padding added on every side of the stacked page.
        flattened_text_regions_pad: Padding added on every side of each region.
        flattened_text_regions: Regions to render.

    Returns:
        A tuple `(image, boxes, char_polygons)`: the stacked image, the box of every region
        (aligned with `flattened_text_regions`, paddings excluded), and all char polygons
        shifted into the coordinates of the stacked image.

    Raises:
        ValueError: If `flattened_text_regions` is empty (raised by `max()` on no boxes).
    """
    page_double_pad = 2 * page_pad
    flattened_text_regions_double_pad = 2 * flattened_text_regions_pad

    rect_packer = RectPacker(rotation=False)

    # Add box and bin.
    # NOTE: Only one bin is added, that is, packing all text region into one image.
    bin_width = 0
    bin_height = 0

    for ftr_idx, flattened_text_region in enumerate(flattened_text_regions):
        padded_width = flattened_text_region.width + flattened_text_regions_double_pad
        padded_height = flattened_text_region.height + flattened_text_regions_double_pad

        rect_packer.add_rect(
            width=padded_width,
            height=padded_height,
            rid=ftr_idx,
        )

        # NOTE: The bin must be able to hold all *padded* boxes stacked vertically.
        # Adding the padding only once to the summed height makes the bin too small for
        # more than one box, and boxes that do not fit are silently left unpacked.
        bin_width = max(bin_width, padded_width)
        bin_height += padded_height

    rect_packer.add_bin(width=bin_width, height=bin_height)

    # Pack boxes.
    rect_packer.pack()  # type: ignore

    # Get packed boxes.
    unordered_boxes: List[Box] = []
    ftr_indices: List[int] = []
    for bin_idx, x, y, width, height, ftr_idx in rect_packer.rect_list():
        assert bin_idx == 0
        unordered_boxes.append(Box(
            up=y,
            down=y + height - 1,
            left=x,
            right=x + width - 1,
        ))
        ftr_indices.append(ftr_idx)

    # Every region must have been packed, otherwise boxes and regions get misaligned below.
    assert len(ftr_indices) == len(flattened_text_regions)

    # Order boxes, i.e. map the packing order back to the order of flattened_text_regions.
    inverse_ftr_indices = [-1] * len(ftr_indices)
    for inverse_ftr_idx, ftr_idx in enumerate(ftr_indices):
        inverse_ftr_indices[ftr_idx] = inverse_ftr_idx
    for inverse_ftr_idx in inverse_ftr_indices:
        assert inverse_ftr_idx >= 0
    padded_boxes = [unordered_boxes[inverse_ftr_idx] for inverse_ftr_idx in inverse_ftr_indices]

    page_height = max(box.down for box in padded_boxes) + 1 + page_double_pad
    page_width = max(box.right for box in padded_boxes) + 1 + page_double_pad

    image = build_background_image_for_stacking(page_height, page_width)
    boxes: List[Box] = []
    char_polygons: List[Polygon] = []

    for padded_box, flattened_text_region in zip(padded_boxes, flattened_text_regions):
        assert flattened_text_region.height + flattened_text_regions_double_pad \
            == padded_box.height
        assert flattened_text_region.width + flattened_text_regions_double_pad \
            == padded_box.width

        # Remove box padding.
        up = padded_box.up + flattened_text_regions_pad + page_pad
        left = padded_box.left + flattened_text_regions_pad + page_pad

        box = Box(
            up=up,
            down=up + flattened_text_region.height - 1,
            left=left,
            right=left + flattened_text_region.width - 1,
        )
        boxes.append(box)

        # Render.
        box.fill_image(
            image,
            flattened_text_region.flattened_image,
            image_mask=flattened_text_region.flattened_mask,
        )

        if flattened_text_region.flattened_char_polygons:
            for char_polygon in flattened_text_region.flattened_char_polygons:
                char_polygons.append(char_polygon.to_shifted_polygon(
                    offset_y=up,
                    offset_x=left,
                ))

    return image, boxes, char_polygons
class PageTextRegionStep(
    PipelineStep[
        PageTextRegionStepConfig,
        PageTextRegionStepInput,
        PageTextRegionStepOutput,
    ]
):  # yapf: disable
    """Pipeline step that turns a distorted page into one image of stacked text regions.

    `run` extracts precise text regions, binds char polygons to them, samples negative
    (non-text) regions, flattens / resizes / post-rotates all regions and finally stacks
    them into a single image.
    """

    @classmethod
    def generate_precise_text_region_candidate_polygons(
        cls,
        precise_mask: Mask,
        disconnected_text_region_mask: Mask,
    ):
        """Intersect two box-attached masks and return the resulting disconnected polygons.

        Both masks must carry a box, and the two boxes must overlap.
        """
        assert precise_mask.box and disconnected_text_region_mask.box

        # Get the intersection.
        intersected_box = Box(
            up=max(precise_mask.box.up, disconnected_text_region_mask.box.up),
            down=min(precise_mask.box.down, disconnected_text_region_mask.box.down),
            left=max(precise_mask.box.left, disconnected_text_region_mask.box.left),
            right=min(precise_mask.box.right, disconnected_text_region_mask.box.right),
        )
        assert intersected_box.up <= intersected_box.down
        assert intersected_box.left <= intersected_box.right

        precise_mask = intersected_box.extract_mask(precise_mask)
        disconnected_text_region_mask = intersected_box.extract_mask(disconnected_text_region_mask)

        # Apply mask bitwise-and operation.
        intersected_mask = Mask(
            mat=(disconnected_text_region_mask.mat & precise_mask.mat).astype(np.uint8)
        )
        intersected_mask = intersected_mask.to_box_attached(intersected_box)

        # NOTE:
        # 1. Could extract more than one polygons.
        # 2. Some polygons are in border and should be removed later.
        return intersected_mask.to_disconnected_polygons()

    @classmethod
    def strtree_query_intersected_polygons(
        cls,
        strtree: STRtree,
        id_to_anchor_polygon: Dict[int, Polygon],
        candidate_polygon: Polygon,
    ):
        """Yield every anchor polygon returned by the tree query for `candidate_polygon`.

        `id_to_anchor_polygon` maps `id()` of each shapely polygon stored in `strtree` to
        its source polygon, hence those shapely polygons must be kept alive by the caller.

        Yields:
            Tuples `(anchor_id, anchor_polygon, anchor_mask, candidate_mask,
            intersected_ratio)`, where the ratio is computed with the candidate as base.
        """
        candidate_shapely_polygon = candidate_polygon.to_shapely_polygon()
        candidate_mask = candidate_polygon.mask

        for anchor_shapely_polygon in strtree.query(candidate_shapely_polygon):
            anchor_id = id(anchor_shapely_polygon)
            anchor_polygon = id_to_anchor_polygon[anchor_id]
            anchor_mask = anchor_polygon.mask

            intersected_ratio = calculate_boxed_masks_intersected_ratio(
                anchor_mask=anchor_mask,
                candidate_mask=candidate_mask,
                use_candidate_as_base=True,
            )

            yield (
                anchor_id,
                anchor_polygon,
                anchor_mask,
                candidate_mask,
                intersected_ratio,
            )

    def sample_page_non_text_region_polygons(
        self,
        page_non_text_region_polygons: Sequence[Polygon],
        num_page_text_region_infos: int,
        rng: RandomGenerator,
    ):
        """Sample non-text polygons without replacement, to be used as negative regions.

        The target count is chosen such that negatives make up about
        `config.negative_text_region_ratio` of all regions, capped by the number of
        available polygons. Raises ZeroDivisionError if the configured ratio is 1.
        """
        negative_ratio = self.config.negative_text_region_ratio
        num_page_non_text_region_polygons = round(
            negative_ratio * num_page_text_region_infos / (1 - negative_ratio)
        )
        return rng_choice_with_size(
            rng,
            page_non_text_region_polygons,
            size=min(
                num_page_non_text_region_polygons,
                len(page_non_text_region_polygons),
            ),
            replace=False,
        )

    def _post_rotate_flattened_text_region(
        self,
        flattened_text_region: FlattenedTextRegion,
        rng: RandomGenerator,
    ):
        """Randomly post-rotate a region, returning the region itself if no rotation is drawn.

        Typical regions could only be turned upside down. Untypical regions could be
        rotated by 180, 90 or 270 degree.
        """
        post_rotate_angle = 0
        if flattened_text_region.is_typical:
            if rng.random() < self.config.text_region_typical_post_rotate_prob:
                # Upside down only.
                post_rotate_angle = 180
        else:
            if rng.random() < self.config.text_region_untypical_post_rotate_prob:
                # 3-way rotate.
                post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25))

        if post_rotate_angle != 0:
            flattened_text_region = \
                flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle)

        return flattened_text_region

    def build_flattened_text_regions(
        self,
        page_image: Image,
        page_text_region_infos: Sequence[PageTextRegionInfo],
        page_non_text_region_polygons: Sequence[Polygon],
        rng: RandomGenerator,
    ):
        """Flatten, resize and post-rotate the positive and negative regions of a page.

        Positive regions are resized such that their char height median hits a randomly
        drawn target. Negative regions (those without char polygons) are resized to a
        height sampled from the resized positive regions, and dropped if they end up larger
        than the largest positive region.

        Returns:
            A tuple holding all positive regions followed by the kept negative regions.
        """
        text_region_polygon_dilate_ratio = float(
            rng.uniform(
                self.config.text_region_flattener_text_region_polygon_dilate_ratio_min,
                self.config.text_region_flattener_text_region_polygon_dilate_ratio_max,
            )
        )
        typical_long_side_ratio_min = \
            self.config.text_region_flattener_typical_long_side_ratio_min

        text_region_polygons: List[Polygon] = []
        grouped_char_polygons: List[Sequence[Polygon]] = []
        for page_text_region_info in page_text_region_infos:
            text_region_polygons.append(page_text_region_info.precise_text_region_polygon)
            grouped_char_polygons.append(page_text_region_info.char_polygons)

        # Inject negative regions, marked by an empty group of char polygons.
        for page_non_text_region_polygon in page_non_text_region_polygons:
            text_region_polygons.append(page_non_text_region_polygon)
            grouped_char_polygons.append(tuple())

        text_region_flattener = TextRegionFlattener(
            typical_long_side_ratio_min=typical_long_side_ratio_min,
            text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
            image=page_image,
            text_region_polygons=text_region_polygons,
            grouped_char_polygons=grouped_char_polygons,
            is_training=True,
        )

        # Resize positive ftr.
        positive_flattened_text_regions: List[FlattenedTextRegion] = []
        # For negative sampling.
        positive_reference_heights: List[float] = []
        positive_reference_widths: List[float] = []
        num_negative_flattened_text_regions = 0

        for flattened_text_region in text_region_flattener.flattened_text_regions:
            if not flattened_text_region.flattened_char_polygons:
                num_negative_flattened_text_regions += 1
                continue

            char_height_median = flattened_text_region.get_char_height_meidan()

            text_region_resize_char_height_median = int(
                rng.integers(
                    self.config.text_region_resize_char_height_median_min,
                    self.config.text_region_resize_char_height_median_max + 1,
                )
            )
            scale = text_region_resize_char_height_median / char_height_median

            height, width = flattened_text_region.shape
            resized_height = round(height * scale)
            resized_width = round(width * scale)

            flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
                resized_height=resized_height,
                resized_width=resized_width,
            )

            positive_reference_heights.append(resized_height)
            positive_reference_widths.append(resized_width)

            # Post rotate.
            flattened_text_region = self._post_rotate_flattened_text_region(
                flattened_text_region,
                rng,
            )

            positive_flattened_text_regions.append(flattened_text_region)

        # Resize negative ftr.
        negative_reference_heights = list(
            rng_choice_with_size(
                rng,
                positive_reference_heights,
                size=num_negative_flattened_text_regions,
                replace=(num_negative_flattened_text_regions > len(positive_reference_heights)),
            )
        )

        # NOTE(review): max() raises ValueError if the page has no positive region.
        negative_height_max = max(positive_reference_heights)
        negative_width_max = max(positive_reference_widths)

        negative_flattened_text_regions: List[FlattenedTextRegion] = []

        for flattened_text_region in text_region_flattener.flattened_text_regions:
            if flattened_text_region.flattened_char_polygons:
                continue

            reference_height = negative_reference_heights.pop()
            scale = reference_height / flattened_text_region.height

            height, width = flattened_text_region.shape
            resized_height = round(height * scale)
            resized_width = round(width * scale)

            # Remove negative region that is too large.
            if resized_height > negative_height_max or resized_width > negative_width_max:
                continue

            flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
                resized_height=resized_height,
                resized_width=resized_width,
            )

            # Post rotate.
            # NOTE(review): Negative regions share the typical / untypical probabilities of
            # the positive regions. `negative_text_region_post_rotate_prob` of the config
            # is not read anywhere in this class -- confirm whether that is intended.
            flattened_text_region = self._post_rotate_flattened_text_region(
                flattened_text_region,
                rng,
            )

            negative_flattened_text_regions.append(flattened_text_region)

        flattened_text_regions = (
            *positive_flattened_text_regions,
            *negative_flattened_text_regions,
        )
        return flattened_text_regions

    def run(self, input: PageTextRegionStepInput, rng: RandomGenerator):
        """Execute the step and return a `PageTextRegionStepOutput`.

        The output carries the stacked (and optionally rotated) image, the char polygons
        in the coordinates of that image, the image shape before the final rotation, the
        applied rotation angle, and the debug object when `config.enable_debug` is set.
        """
        page_distortion_step_output = input.page_distortion_step_output
        page_image = page_distortion_step_output.page_image
        page_char_polygon_collection = page_distortion_step_output.page_char_polygon_collection
        page_disconnected_text_region_collection = \
            page_distortion_step_output.page_disconnected_text_region_collection
        page_non_text_region_collection = \
            page_distortion_step_output.page_non_text_region_collection

        page_resizing_step_output = input.page_resizing_step_output
        page_resized_text_line_mask = page_resizing_step_output.page_text_line_mask

        debug = None
        if self.config.enable_debug:
            debug = PageTextRegionStepDebug()

        # Build R-tree to track text regions.
        # https://github.com/shapely/shapely/issues/640
        # NOTE: Polygons are looked up through id() of the shapely polygons, hence the list
        # of shapely polygons must stay alive as long as the mapping is used.
        id_to_disconnected_text_region_polygon: Dict[int, Polygon] = {}
        disconnected_text_region_shapely_polygons: List[ShapelyPolygon] = []

        for polygon in page_disconnected_text_region_collection.to_polygons():
            shapely_polygon = polygon.to_shapely_polygon()
            id_to_disconnected_text_region_polygon[id(shapely_polygon)] = polygon
            disconnected_text_region_shapely_polygons.append(shapely_polygon)

        disconnected_text_region_tree = STRtree(disconnected_text_region_shapely_polygons)

        # Get the precise text regions.
        precise_text_region_candidate_polygons: List[Polygon] = []
        for resized_precise_polygon in page_resized_text_line_mask.to_disconnected_polygons():
            # Resize back to the shape after distortion.
            precise_polygon = resized_precise_polygon.to_conducted_resized_polygon(
                page_resized_text_line_mask,
                resized_height=page_image.height,
                resized_width=page_image.width,
            )

            # Find and extract intersected text region.
            # NOTE: One precise_polygon could be overlapped with
            # more than one disconnected_text_region_polygon!
            for _, _, disconnected_text_region_mask, precise_mask, _ in \
                self.strtree_query_intersected_polygons(
                    strtree=disconnected_text_region_tree,
                    id_to_anchor_polygon=id_to_disconnected_text_region_polygon,
                    candidate_polygon=precise_polygon,
                ):
                precise_text_region_candidate_polygons.extend(
                    self.generate_precise_text_region_candidate_polygons(
                        precise_mask=precise_mask,
                        disconnected_text_region_mask=disconnected_text_region_mask,
                    )
                )

        if debug:
            debug.page_image = page_image
            debug.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons

        # Help gc.
        del id_to_disconnected_text_region_polygon
        del disconnected_text_region_shapely_polygons
        del disconnected_text_region_tree

        # Bind char-level polygon to precise text region.
        id_to_precise_text_region_polygon: Dict[int, Polygon] = {}
        precise_text_region_shapely_polygons: List[ShapelyPolygon] = []

        for polygon in precise_text_region_candidate_polygons:
            shapely_polygon = polygon.to_shapely_polygon()
            id_to_precise_text_region_polygon[id(shapely_polygon)] = polygon
            precise_text_region_shapely_polygons.append(shapely_polygon)

        precise_text_region_tree = STRtree(precise_text_region_shapely_polygons)

        id_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list)
        for char_polygon in page_char_polygon_collection.polygons:
            # Assign the char to the region with the largest (non-zero) intersected ratio.
            best_precise_text_region_id = None
            intersected_ratio_max = 0

            for (
                precise_text_region_id,
                _,
                _,
                _,
                intersected_ratio,
            ) in self.strtree_query_intersected_polygons(
                strtree=precise_text_region_tree,
                id_to_anchor_polygon=id_to_precise_text_region_polygon,
                candidate_polygon=char_polygon,
            ):
                if intersected_ratio > intersected_ratio_max:
                    intersected_ratio_max = intersected_ratio
                    best_precise_text_region_id = precise_text_region_id

            if best_precise_text_region_id is not None:
                id_to_char_polygons[best_precise_text_region_id].append(char_polygon)
            else:
                # NOTE: Text line with only a small char (i.e. delimiter) could enter this branch.
                # In such case, the text line bounding box is smaller than the char polygon, since
                # the leading/trailing char paddings are ignored during text line rendering.
                # It's acceptable for now since: 1) this case happens rarely, 2) and it won't
                # introduce labeling noise.
                logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}')

        page_text_region_infos: List[PageTextRegionInfo] = []
        for precise_text_region_shapely_polygon in precise_text_region_shapely_polygons:
            ptrsp_id = id(precise_text_region_shapely_polygon)
            if ptrsp_id not in id_to_char_polygons:
                # Not related to any char polygons.
                continue
            assert id_to_char_polygons[ptrsp_id]
            page_text_region_infos.append(
                PageTextRegionInfo(
                    precise_text_region_polygon=id_to_precise_text_region_polygon[ptrsp_id],
                    char_polygons=id_to_char_polygons[ptrsp_id],
                )
            )

        # Help gc.
        del id_to_precise_text_region_polygon
        del precise_text_region_shapely_polygons
        del precise_text_region_tree

        if debug:
            debug.page_text_region_infos = page_text_region_infos

        # Negative sampling.
        page_non_text_region_polygons = self.sample_page_non_text_region_polygons(
            page_non_text_region_polygons=tuple(page_non_text_region_collection.to_polygons()),
            num_page_text_region_infos=len(page_text_region_infos),
            rng=rng,
        )

        flattened_text_regions = self.build_flattened_text_regions(
            page_image=page_image,
            page_text_region_infos=page_text_region_infos,
            page_non_text_region_polygons=page_non_text_region_polygons,
            rng=rng,
        )
        if debug:
            debug.flattened_text_regions = flattened_text_regions

        # Stack text regions.
        image, _, char_polygons = stack_flattened_text_regions(
            page_pad=0,
            flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad,
            flattened_text_regions=flattened_text_regions,
        )

        # Post uniform rotation.
        shape_before_rotate = image.shape
        rotate_angle = 0

        if self.config.enable_post_rotate:
            rotate_angle = int(
                rng.integers(
                    self.config.post_rotate_angle_min,
                    self.config.post_rotate_angle_max + 1,
                )
            )
            rotated_result = rotate.distort(
                {'angle': rotate_angle},
                image=image,
                polygons=char_polygons,
            )
            assert rotated_result.image and rotated_result.polygons
            image = rotated_result.image
            char_polygons = rotated_result.polygons

        return PageTextRegionStepOutput(
            page_image=image,
            page_char_polygons=char_polygons,
            shape_before_rotate=shape_before_rotate,
            rotate_angle=rotate_angle,
            debug=debug,
        )
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
    class Mapping(Generic[KT, VT]):
        def __getitem__(self, key: KT) -> VT:
            ...
        # Etc.
This class can then be used as follows::
    def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
        try:
            return mapping[key]
        except KeyError:
            return default
@classmethod
def generate_precise_text_region_candidate_polygons(
    cls,
    precise_mask: Mask,
    disconnected_text_region_mask: Mask,
):
    """
    Intersect a precise mask with a disconnected text region mask and
    return the disconnected polygons of the intersection.

    Both masks must carry a ``box``. The two boxes are intersected, each mask
    is passed through ``Box.extract_mask`` of the intersected box, and the
    bitwise AND of the two resulting matrices is wrapped into a new mask
    attached to the intersected box.

    :param precise_mask: boxed mask of the precise polygon.
    :param disconnected_text_region_mask: boxed mask of the text region.
    :return: the result of ``to_disconnected_polygons()`` on the ANDed mask.
    """
    precise_box = precise_mask.box
    region_box = disconnected_text_region_mask.box
    assert precise_box and region_box

    # Overlapping rectangle of the two boxes.
    overlap_box = Box(
        up=max(precise_box.up, region_box.up),
        down=min(precise_box.down, region_box.down),
        left=max(precise_box.left, region_box.left),
        right=min(precise_box.right, region_box.right),
    )
    # The caller is expected to pass masks whose boxes actually overlap.
    assert overlap_box.up <= overlap_box.down
    assert overlap_box.left <= overlap_box.right

    # Bring both masks to the overlapping rectangle so the matrices line up.
    extracted_precise_mask = overlap_box.extract_mask(precise_mask)
    extracted_region_mask = overlap_box.extract_mask(disconnected_text_region_mask)

    # Keep only the pixels set in both masks.
    overlap_mat = (extracted_region_mask.mat & extracted_precise_mask.mat).astype(np.uint8)
    overlap_mask = Mask(mat=overlap_mat).to_box_attached(overlap_box)

    # NOTE:
    # 1. More than one polygon could be extracted.
    # 2. Some polygons are in border and should be removed later.
    return overlap_mask.to_disconnected_polygons()
@classmethod
def strtree_query_intersected_polygons(
    cls,
    strtree: STRtree,
    id_to_anchor_polygon: Dict[int, Polygon],
    candidate_polygon: Polygon,
):
    """
    Lazily yield every anchor polygon the R-tree returns for a candidate.

    For each geometry returned by ``strtree.query`` the matching anchor
    polygon is looked up through ``id()`` of the returned geometry, so
    ``id_to_anchor_polygon`` must be keyed by ``id()`` of the very shapely
    objects stored in ``strtree``.

    :param strtree: R-tree built from the anchor shapely polygons.
    :param id_to_anchor_polygon: maps ``id(shapely_polygon)`` to its polygon.
    :param candidate_polygon: polygon used as the query geometry.
    :return: generator of ``(anchor_id, anchor_polygon, anchor_mask,
        candidate_mask, intersected_ratio)`` tuples, where the ratio is
        computed with the candidate as base.
    """
    query_geometry = candidate_polygon.to_shapely_polygon()
    candidate_mask = candidate_polygon.mask

    for hit_geometry in strtree.query(query_geometry):
        # NOTE(review): relies on query() handing back the inserted geometry
        # objects themselves (the file pins Shapely below 2.0) -- confirm
        # before upgrading Shapely.
        hit_id = id(hit_geometry)
        hit_polygon = id_to_anchor_polygon[hit_id]
        hit_mask = hit_polygon.mask

        yield (
            hit_id,
            hit_polygon,
            hit_mask,
            candidate_mask,
            calculate_boxed_masks_intersected_ratio(
                anchor_mask=hit_mask,
                candidate_mask=candidate_mask,
                use_candidate_as_base=True,
            ),
        )
def sample_page_non_text_region_polygons(
    self,
    page_non_text_region_polygons: Sequence[Polygon],
    num_page_text_region_infos: int,
    rng: RandomGenerator,
):
    """
    Draw negative (non-text) region polygons without replacement.

    The requested count ``n`` satisfies ``n / (n + positives) == ratio`` with
    ``ratio = config.negative_text_region_ratio`` (rounded), and is capped by
    the number of polygons available.

    :param page_non_text_region_polygons: pool of candidate negatives.
    :param num_page_text_region_infos: number of positive text regions.
    :param rng: random generator used for sampling.
    :return: whatever ``rng_choice_with_size`` returns for the chosen size.
    """
    ratio = self.config.negative_text_region_ratio
    wanted = round(ratio * num_page_text_region_infos / (1 - ratio))

    # Cannot draw more distinct polygons than the pool contains.
    available = len(page_non_text_region_polygons)
    sample_size = wanted if wanted < available else available

    return rng_choice_with_size(
        rng,
        page_non_text_region_polygons,
        size=sample_size,
        replace=False,
    )
def build_flattened_text_regions(
    self,
    page_image: Image,
    page_text_region_infos: Sequence[PageTextRegionInfo],
    page_non_text_region_polygons: Sequence[Polygon],
    rng: RandomGenerator,
):
    """
    Flatten, resize and randomly post-rotate positive and negative regions.

    Positive regions (those with char polygons) are resized so that their
    median char height hits a value drawn from
    ``[text_region_resize_char_height_median_min,
    text_region_resize_char_height_median_max]``. Negative regions are resized
    to a height sampled from the resized positive heights and dropped when
    they end up taller or wider than the largest positive region.

    :param page_image: page image the regions are cut from.
    :param page_text_region_infos: positive regions with their char polygons.
    :param page_non_text_region_polygons: sampled negative region polygons.
    :param rng: random generator driving every random decision.
    :return: tuple of flattened text regions, positives first, then
        negatives. Empty when no positive region could be built, since the
        negatives then have no reference size to be scaled to.
    """
    config = self.config

    def apply_post_rotate(flattened_text_region: 'FlattenedTextRegion'):
        # Shared by positives and negatives: typical regions may only be
        # flipped upside down, untypical ones get a 3-way rotation.
        # NOTE(review): `negative_text_region_post_rotate_prob` exists in the
        # config but is not consulted for negatives here -- confirm intended.
        post_rotate_angle = 0
        if flattened_text_region.is_typical:
            if rng.random() < config.text_region_typical_post_rotate_prob:
                post_rotate_angle = 180
        elif rng.random() < config.text_region_untypical_post_rotate_prob:
            post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25))

        if post_rotate_angle != 0:
            return flattened_text_region.to_post_rotated_flattened_text_region(
                post_rotate_angle
            )
        return flattened_text_region

    text_region_polygon_dilate_ratio = float(
        rng.uniform(
            config.text_region_flattener_text_region_polygon_dilate_ratio_min,
            config.text_region_flattener_text_region_polygon_dilate_ratio_max,
        )
    )
    typical_long_side_ratio_min = config.text_region_flattener_typical_long_side_ratio_min

    text_region_polygons: List[Polygon] = []
    grouped_char_polygons: List[Sequence[Polygon]] = []
    for page_text_region_info in page_text_region_infos:
        text_region_polygons.append(page_text_region_info.precise_text_region_polygon)
        grouped_char_polygons.append(page_text_region_info.char_polygons)

    # Inject negative regions: they carry no char polygons.
    for page_non_text_region_polygon in page_non_text_region_polygons:
        text_region_polygons.append(page_non_text_region_polygon)
        grouped_char_polygons.append(tuple())

    text_region_flattener = TextRegionFlattener(
        typical_long_side_ratio_min=typical_long_side_ratio_min,
        text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio,
        image=page_image,
        text_region_polygons=text_region_polygons,
        grouped_char_polygons=grouped_char_polygons,
        is_training=True,
    )

    # Resize positive ftr.
    positive_flattened_text_regions: List[FlattenedTextRegion] = []
    # Resized sizes of the positives, used as reference for the negatives.
    positive_reference_heights: List[float] = []
    positive_reference_widths: List[float] = []
    num_negative_flattened_text_regions = 0

    for flattened_text_region in text_region_flattener.flattened_text_regions:
        if not flattened_text_region.flattened_char_polygons:
            num_negative_flattened_text_regions += 1
            continue

        char_height_median = flattened_text_region.get_char_height_meidan()

        text_region_resize_char_height_median = int(
            rng.integers(
                config.text_region_resize_char_height_median_min,
                config.text_region_resize_char_height_median_max + 1,
            )
        )
        scale = text_region_resize_char_height_median / char_height_median

        height, width = flattened_text_region.shape
        resized_height = round(height * scale)
        resized_width = round(width * scale)

        flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
            resized_height=resized_height,
            resized_width=resized_width,
        )

        positive_reference_heights.append(resized_height)
        positive_reference_widths.append(resized_width)

        positive_flattened_text_regions.append(apply_post_rotate(flattened_text_region))

    if not positive_reference_heights:
        # Without any positive region there is nothing to sample reference
        # heights from, and max() below would raise ValueError on the empty
        # lists. Negatives cannot be scaled, so nothing is returned.
        return ()

    # Resize negative ftr.
    negative_reference_heights = list(
        rng_choice_with_size(
            rng,
            positive_reference_heights,
            size=num_negative_flattened_text_regions,
            # Only sample with replacement when there are more negatives
            # than reference heights.
            replace=(num_negative_flattened_text_regions > len(positive_reference_heights)),
        )
    )

    negative_height_max = max(positive_reference_heights)
    negative_width_max = max(positive_reference_widths)

    negative_flattened_text_regions: List[FlattenedTextRegion] = []

    for flattened_text_region in text_region_flattener.flattened_text_regions:
        if flattened_text_region.flattened_char_polygons:
            continue

        reference_height = negative_reference_heights.pop()
        scale = reference_height / flattened_text_region.height

        height, width = flattened_text_region.shape
        resized_height = round(height * scale)
        resized_width = round(width * scale)

        # Remove negative region that is too large.
        if resized_height > negative_height_max or resized_width > negative_width_max:
            continue

        flattened_text_region = flattened_text_region.to_resized_flattened_text_region(
            resized_height=resized_height,
            resized_width=resized_width,
        )

        negative_flattened_text_regions.append(apply_post_rotate(flattened_text_region))

    return (
        *positive_flattened_text_regions,
        *negative_flattened_text_regions,
    )
def run(self, input: PageTextRegionStepInput, rng: RandomGenerator):
    """
    Turn a distorted page into a stacked image of flattened text regions.

    Steps, in order:

    1. Intersect the polygons of the resized text line mask (scaled back to
       the page image size) with the disconnected text regions to obtain
       precise text region candidates.
    2. Bind every char polygon to the candidate with the largest intersected
       ratio; candidates without chars are discarded.
    3. Sample negative regions, flatten all regions and stack them.
    4. Optionally rotate the stacked image by a random angle in
       ``[post_rotate_angle_min, post_rotate_angle_max]``.

    :param input: outputs of the page distortion and page resizing steps.
    :param rng: random generator driving every random decision.
    :return: ``PageTextRegionStepOutput`` with the stacked image, its char
        polygons, the pre-rotation shape, the rotation angle and the
        optional debug record.
    """
    distortion_output = input.page_distortion_step_output
    page_image = distortion_output.page_image
    char_polygon_collection = distortion_output.page_char_polygon_collection
    text_region_collection = distortion_output.page_disconnected_text_region_collection
    non_text_region_collection = distortion_output.page_non_text_region_collection

    resized_text_line_mask = input.page_resizing_step_output.page_text_line_mask

    debug = PageTextRegionStepDebug() if self.config.enable_debug else None

    # Build R-tree to track text regions. Hits are mapped back to polygons
    # through id() of the shapely objects, see
    # https://github.com/shapely/shapely/issues/640
    region_shapely_polygons: List[ShapelyPolygon] = []
    region_id_to_polygon: Dict[int, Polygon] = {}
    for region_polygon in text_region_collection.to_polygons():
        region_geometry = region_polygon.to_shapely_polygon()
        region_shapely_polygons.append(region_geometry)
        region_id_to_polygon[id(region_geometry)] = region_polygon

    region_tree = STRtree(region_shapely_polygons)

    # Get the precise text regions.
    candidate_polygons: List[Polygon] = []
    for resized_precise_polygon in resized_text_line_mask.to_disconnected_polygons():
        # Resize back to the shape after distortion.
        precise_polygon = resized_precise_polygon.to_conducted_resized_polygon(
            resized_text_line_mask,
            resized_height=page_image.height,
            resized_width=page_image.width,
        )

        # NOTE: One precise_polygon could be overlapped with more than one
        # disconnected text region polygon, each hit is handled separately.
        hits = self.strtree_query_intersected_polygons(
            strtree=region_tree,
            id_to_anchor_polygon=region_id_to_polygon,
            candidate_polygon=precise_polygon,
        )
        for hit in hits:
            region_mask, precise_mask = hit[2], hit[3]
            candidate_polygons.extend(
                self.generate_precise_text_region_candidate_polygons(
                    precise_mask=precise_mask,
                    disconnected_text_region_mask=region_mask,
                )
            )

    if debug:
        debug.page_image = page_image
        debug.precise_text_region_candidate_polygons = candidate_polygons

    # Help gc.
    del region_id_to_polygon
    del region_shapely_polygons
    del region_tree

    # Bind char-level polygon to precise text region.
    precise_shapely_polygons: List[ShapelyPolygon] = []
    precise_id_to_polygon: Dict[int, Polygon] = {}
    for candidate_polygon in candidate_polygons:
        candidate_geometry = candidate_polygon.to_shapely_polygon()
        precise_shapely_polygons.append(candidate_geometry)
        precise_id_to_polygon[id(candidate_geometry)] = candidate_polygon

    precise_tree = STRtree(precise_shapely_polygons)

    precise_id_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list)
    for char_polygon in char_polygon_collection.polygons:
        best_region_id = None
        best_ratio = 0

        # Keep the first region with the strictly largest positive ratio.
        for hit in self.strtree_query_intersected_polygons(
            strtree=precise_tree,
            id_to_anchor_polygon=precise_id_to_polygon,
            candidate_polygon=char_polygon,
        ):
            region_id, ratio = hit[0], hit[4]
            if ratio > best_ratio:
                best_ratio = ratio
                best_region_id = region_id

        if best_region_id is None:
            # NOTE: Text line with only a small char (i.e. delimiter) could enter this
            # branch. In such case, the text line bounding box is smaller than the char
            # polygon, since the leading/trailing char paddings are ignored during text
            # line rendering. It's acceptable for now since: 1) this case happens rarely,
            # 2) and it won't introduce labeling noise.
            logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}')
            continue

        precise_id_to_char_polygons[best_region_id].append(char_polygon)

    page_text_region_infos: List[PageTextRegionInfo] = []
    for precise_geometry in precise_shapely_polygons:
        precise_id = id(precise_geometry)
        # Regions not related to any char polygon are skipped.
        if precise_id in precise_id_to_char_polygons:
            bound_char_polygons = precise_id_to_char_polygons[precise_id]
            assert bound_char_polygons
            page_text_region_infos.append(
                PageTextRegionInfo(
                    precise_text_region_polygon=precise_id_to_polygon[precise_id],
                    char_polygons=bound_char_polygons,
                )
            )

    # Help gc.
    del precise_id_to_polygon
    del precise_shapely_polygons
    del precise_tree

    if debug:
        debug.page_text_region_infos = page_text_region_infos

    # Negative sampling.
    sampled_non_text_region_polygons = self.sample_page_non_text_region_polygons(
        page_non_text_region_polygons=tuple(non_text_region_collection.to_polygons()),
        num_page_text_region_infos=len(page_text_region_infos),
        rng=rng,
    )

    flattened_text_regions = self.build_flattened_text_regions(
        page_image=page_image,
        page_text_region_infos=page_text_region_infos,
        page_non_text_region_polygons=sampled_non_text_region_polygons,
        rng=rng,
    )
    if debug:
        debug.flattened_text_regions = flattened_text_regions

    # Stack text regions.
    image, _, char_polygons = stack_flattened_text_regions(
        page_pad=0,
        flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad,
        flattened_text_regions=flattened_text_regions,
    )

    # Post uniform rotation. The shape is recorded before rotating.
    shape_before_rotate = image.shape
    rotate_angle = 0

    if self.config.enable_post_rotate:
        # The upper bound of rng.integers is exclusive, hence the +1.
        angle_low = self.config.post_rotate_angle_min
        angle_high = self.config.post_rotate_angle_max + 1
        rotate_angle = int(rng.integers(angle_low, angle_high))

        rotated_result = rotate.distort(
            {'angle': rotate_angle},
            image=image,
            polygons=char_polygons,
        )
        assert rotated_result.image and rotated_result.polygons
        image = rotated_result.image
        char_polygons = rotated_result.polygons

    return PageTextRegionStepOutput(
        page_image=image,
        page_char_polygons=char_polygons,
        shape_before_rotate=shape_before_rotate,
        rotate_angle=rotate_angle,
        debug=debug,
    )