vkit.pipeline.text_detection.page_text_region
1# Copyright 2022 vkit-x Administrator. All Rights Reserved. 2# 3# This project (vkit-x/vkit) is dual-licensed under commercial and SSPL licenses. 4# 5# The commercial license gives you the full rights to create and distribute software 6# on your own terms without any SSPL license obligations. For more information, 7# please see the "LICENSE_COMMERCIAL.txt" file. 8# 9# This project is also available under Server Side Public License (SSPL). 10# The SSPL licensing is ideal for use cases such as open source projects with 11# SSPL distribution, student/academic purposes, hobby projects, internal research 12# projects without external distribution, or other projects where all SSPL 13# obligations can be met. For more information, please see the "LICENSE_SSPL.txt" file. 14from typing import cast, List, Optional, DefaultDict, Sequence, Tuple 15from collections import defaultdict 16import itertools 17import math 18import statistics 19import logging 20 21import attrs 22from numpy.random import Generator as RandomGenerator 23import numpy as np 24from sklearn.neighbors import KDTree 25from shapely.strtree import STRtree 26from shapely.geometry import Polygon as ShapelyPolygon 27from rectpack import newPacker as RectPacker 28 29from vkit.utility import rng_choice, rng_choice_with_size 30from vkit.element import PointList, Box, Polygon, Mask, Image, ElementSetOperationMode 31from vkit.mechanism.distortion import rotate 32from ..interface import PipelineStep, PipelineStepFactory 33from .page_distortion import PageDistortionStepOutput 34from .page_resizing import PageResizingStepOutput 35 36logger = logging.getLogger(__name__) 37 38 39@attrs.define 40class PageTextRegionStepConfig: 41 use_adjusted_char_polygons: bool = False 42 prob_drop_single_char_page_text_region_info: float = 0.5 43 text_region_flattener_typical_long_side_ratio_min: float = 3.0 44 text_region_flattener_text_region_polygon_dilate_ratio_min: float = 0.85 45 text_region_flattener_text_region_polygon_dilate_ratio_max: float = 1.0 46 text_region_resize_char_height_median_min: int = 32 47 text_region_resize_char_height_median_max: int = 46 48 prob_text_region_typical_post_rotate: float = 0.2 49 prob_text_region_untypical_post_rotate: float = 0.2 50 negative_text_region_ratio: float = 0.1 51 prob_negative_text_region_post_rotate: float = 0.2 52 stack_flattened_text_regions_pad: int = 2 53 prob_post_rotate_90_angle: float = 0.5 54 prob_post_rotate_random_angle: float = 0.0 55 post_rotate_random_angle_min: int = -5 56 post_rotate_random_angle_max: int = 5 57 enable_debug: bool = False 58 59 60@attrs.define 61class PageTextRegionStepInput: 62 page_distortion_step_output: PageDistortionStepOutput 63 page_resizing_step_output: PageResizingStepOutput 64 65 66@attrs.define 67class PageTextRegionInfo: 68 precise_text_region_polygon: Polygon 69 char_polygons: Sequence[Polygon] 70 71 72@attrs.define 73class FlattenedTextRegion: 74 is_typical: bool 75 text_region_polygon: Polygon 76 text_region_image: Image 77 bounding_extended_text_region_mask: Mask 78 flattening_rotate_angle: int 79 shape_before_trim: Tuple[int, int] 80 rotated_trimmed_box: Box 81 shape_before_resize: Tuple[int, int] 82 post_rotate_angle: int 83 flattened_image: Image 84 flattened_mask: Mask 85 flattened_char_polygons: Optional[Sequence[Polygon]] 86 87 @property 88 def shape(self): 89 return self.flattened_image.shape 90 91 @property 92 def height(self): 93 return self.flattened_image.height 94 95 @property 96 def width(self): 97 return self.flattened_image.width 98 99 @property 100 def area(self): 101 return self.flattened_image.area 102 103 def get_char_height_meidan(self): 104 assert self.flattened_char_polygons 105 return statistics.median( 106 char_polygon.get_rectangular_height() for char_polygon in self.flattened_char_polygons 107 ) 108 109 def to_resized_flattened_text_region( 110 self, 111 resized_height: Optional[int] = None, 112 resized_width: Optional[int] = None, 113 ): 114 resized_flattened_image = self.flattened_image.to_resized_image( 115 resized_height=resized_height, 116 resized_width=resized_width, 117 ) 118 119 resized_flattened_mask = self.flattened_mask.to_resized_mask( 120 resized_height=resized_height, 121 resized_width=resized_width, 122 ) 123 124 resized_flattened_char_polygons = None 125 if self.flattened_char_polygons is not None: 126 resized_flattened_char_polygons = [ 127 flattened_char_polygon.to_conducted_resized_polygon( 128 self.shape, 129 resized_height=resized_height, 130 resized_width=resized_width, 131 ) for flattened_char_polygon in self.flattened_char_polygons 132 ] 133 134 return attrs.evolve( 135 self, 136 flattened_image=resized_flattened_image, 137 flattened_mask=resized_flattened_mask, 138 flattened_char_polygons=resized_flattened_char_polygons, 139 ) 140 141 def to_post_rotated_flattened_text_region( 142 self, 143 post_rotate_angle: int, 144 ): 145 assert self.post_rotate_angle == 0 146 147 # NOTE: No need to trim. 148 rotated_result = rotate.distort( 149 {'angle': post_rotate_angle}, 150 image=self.flattened_image, 151 mask=self.flattened_mask, 152 polygons=self.flattened_char_polygons, 153 ) 154 rotated_flattened_image = rotated_result.image 155 assert rotated_flattened_image 156 rotated_flattened_mask = rotated_result.mask 157 assert rotated_flattened_mask 158 rotated_flattened_char_polygons = rotated_result.polygons 159 160 return attrs.evolve( 161 self, 162 post_rotate_angle=post_rotate_angle, 163 flattened_image=rotated_flattened_image, 164 flattened_mask=rotated_flattened_mask, 165 flattened_char_polygons=rotated_flattened_char_polygons, 166 ) 167 168 169@attrs.define 170class PageTextRegionStepDebug: 171 page_image: Image = attrs.field(default=None) 172 precise_text_region_candidate_polygons: Sequence[Polygon] = attrs.field(default=None) 173 page_text_region_infos: Sequence[PageTextRegionInfo] = attrs.field(default=None) 174 flattened_text_regions: Sequence[FlattenedTextRegion] = attrs.field(default=None) 175 176 177@attrs.define 178class PageTextRegionStepOutput: 179 page_image: Image 180 page_active_mask: Mask 181 page_char_polygons: Sequence[Polygon] 182 page_text_region_polygons: Sequence[Polygon] 183 page_char_polygon_text_region_polygon_indices: Sequence[int] 184 shape_before_rotate: Tuple[int, int] 185 rotate_angle: int 186 debug: Optional[PageTextRegionStepDebug] 187 188 189def calculate_boxed_masks_intersected_ratio( 190 anchor_mask: Mask, 191 candidate_mask: Mask, 192 use_candidate_as_base: bool = False, 193): 194 anchor_box = anchor_mask.box 195 assert anchor_box 196 197 candidate_box = candidate_mask.box 198 assert candidate_box 199 200 # Calculate intersection. 201 up = max(anchor_box.up, candidate_box.up) 202 down = min(anchor_box.down, candidate_box.down) 203 left = max(anchor_box.left, candidate_box.left) 204 right = min(anchor_box.right, candidate_box.right) 205 206 if up > down or left > right: 207 return 0.0 208 209 np_intersected_anchor_mask = anchor_mask.mat[ 210 up - anchor_box.up:down - anchor_box.up + 1, 211 left - anchor_box.left:right - anchor_box.left + 1, 212 ] # yapf: disable 213 np_intersected_candidate_mask = candidate_mask.mat[ 214 up - candidate_box.up:down - candidate_box.up + 1, 215 left - candidate_box.left:right - candidate_box.left + 1, 216 ] # yapf: disable 217 np_intersected_mask = np_intersected_anchor_mask & np_intersected_candidate_mask 218 intersected_area = int(np_intersected_mask.sum()) 219 220 if use_candidate_as_base: 221 base_area = int(candidate_mask.np_mask.sum()) 222 else: 223 base_area = ( 224 int(anchor_mask.np_mask.sum()) + int(candidate_mask.np_mask.sum()) - intersected_area 225 ) 226 227 return intersected_area / base_area 228 229 230class TextRegionFlattener: 231 232 @classmethod 233 def patch_text_region_polygons( 234 cls, 235 text_region_polygons: Sequence[Polygon], 236 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 237 ): 238 if grouped_char_polygons is None: 239 return text_region_polygons 240 241 assert len(text_region_polygons) == len(grouped_char_polygons) 242 243 patched_text_region_polygons: List[Polygon] = [] 244 for text_region_polygon, char_polygons in zip(text_region_polygons, grouped_char_polygons): 245 # Need to make sure all char polygons are included. 246 unionized_polygons = [text_region_polygon] 247 unionized_polygons.extend(char_polygons) 248 249 bounding_box = Box.from_boxes((polygon.bounding_box for polygon in unionized_polygons)) 250 mask = Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 251 for polygon in unionized_polygons: 252 polygon.fill_mask(mask) 253 254 patched_text_region_polygons.append(mask.to_external_polygon()) 255 256 return patched_text_region_polygons 257 258 @classmethod 259 def get_dilated_and_bounding_rectangular_polygons( 260 cls, 261 text_region_polygon_dilate_ratio: float, 262 shape: Tuple[int, int], 263 text_region_polygons: Sequence[Polygon], 264 force_no_dilation_flags: Optional[Sequence[bool]] = None, 265 ): 266 dilated_text_region_polygons: List[Polygon] = [] 267 bounding_rectangular_polygons: List[Polygon] = [] 268 269 if force_no_dilation_flags is None: 270 force_no_dilation_flags_iter = itertools.repeat(False) 271 else: 272 assert len(force_no_dilation_flags) == len(text_region_polygons) 273 force_no_dilation_flags_iter = force_no_dilation_flags 274 275 for text_region_polygon, force_no_dilation_flag in zip( 276 text_region_polygons, force_no_dilation_flags_iter 277 ): 278 279 if not force_no_dilation_flag: 280 # Dilate. 281 text_region_polygon = text_region_polygon.to_dilated_polygon( 282 ratio=text_region_polygon_dilate_ratio, 283 ) 284 text_region_polygon = text_region_polygon.to_clipped_polygon(shape) 285 286 dilated_text_region_polygons.append(text_region_polygon) 287 bounding_rectangular_polygons.append( 288 text_region_polygon.to_bounding_rectangular_polygon(shape) 289 ) 290 291 return dilated_text_region_polygons, bounding_rectangular_polygons 292 293 @classmethod 294 def analyze_bounding_rectangular_polygons( 295 cls, 296 bounding_rectangular_polygons: Sequence[Polygon], 297 ): 298 short_side_lengths: List[float] = [] 299 long_side_ratios: List[float] = [] 300 long_side_angles: List[int] = [] 301 302 for polygon in bounding_rectangular_polygons: 303 # Get reference line. 304 point0, point1, _, point3 = polygon.points 305 side0_length = math.hypot( 306 point0.smooth_y - point1.smooth_y, 307 point0.smooth_x - point1.smooth_x, 308 ) 309 side1_length = math.hypot( 310 point0.smooth_y - point3.smooth_y, 311 point0.smooth_x - point3.smooth_x, 312 ) 313 314 # Get the short side length. 315 short_side_lengths.append(min(side0_length, side1_length)) 316 317 long_side_ratios.append( 318 max(side0_length, side1_length) / min(side0_length, side1_length) 319 ) 320 321 point_a = point0 322 if side0_length > side1_length: 323 # Reference line (p0 -> p1). 324 point_b = point1 325 else: 326 # Reference line (p0 -> p3). 327 point_b = point3 328 329 # Get the angle of reference line, in [0, 180) degree. 330 np_theta = np.arctan2( 331 point_a.smooth_y - point_b.smooth_y, 332 point_a.smooth_x - point_b.smooth_x, 333 ) 334 np_theta = np_theta % np.pi 335 long_side_angle = round(np_theta / np.pi * 180) % 180 336 long_side_angles.append(long_side_angle) 337 338 return short_side_lengths, long_side_ratios, long_side_angles 339 340 @classmethod 341 def get_typical_indices( 342 cls, 343 typical_long_side_ratio_min: float, 344 long_side_ratios: Sequence[float], 345 ): 346 return tuple( 347 idx for idx, long_side_ratio in enumerate(long_side_ratios) 348 if long_side_ratio >= typical_long_side_ratio_min 349 ) 350 351 @classmethod 352 def check_first_text_region_polygon_is_larger( 353 cls, 354 text_region_polygons: Sequence[Polygon], 355 short_side_lengths: Sequence[float], 356 first_idx: int, 357 second_idx: int, 358 ): 359 first_text_region_polygon = text_region_polygons[first_idx] 360 second_text_region_polygon = text_region_polygons[second_idx] 361 362 # The short side indicates the text line height. 363 first_short_side_length = short_side_lengths[first_idx] 364 second_short_side_length = short_side_lengths[second_idx] 365 366 return ( 367 first_text_region_polygon.area >= second_text_region_polygon.area 368 and first_short_side_length >= second_short_side_length 369 ) 370 371 @classmethod 372 def get_main_and_flattening_rotate_angles( 373 cls, 374 text_region_polygons: Sequence[Polygon], 375 typical_indices: Sequence[int], 376 short_side_lengths: Sequence[float], 377 long_side_angles: Sequence[int], 378 ): 379 typical_indices_set = set(typical_indices) 380 text_region_center_points = [ 381 text_region_polygon.get_center_point() for text_region_polygon in text_region_polygons 382 ] 383 384 main_angles: List[Optional[int]] = [None] * len(long_side_angles) 385 386 # 1. For typical indices, or if no typical indices. 387 for idx, long_side_angle in enumerate(long_side_angles): 388 if not typical_indices_set or idx in typical_indices_set: 389 main_angles[idx] = long_side_angle 390 391 # 2. For nontypcial indices. 392 if typical_indices_set: 393 typical_center_points = PointList( 394 text_region_center_points[idx] for idx in typical_indices 395 ) 396 kd_tree = KDTree(typical_center_points.to_np_array()) 397 398 nontypical_indices = tuple( 399 idx for idx, _ in enumerate(long_side_angles) if idx not in typical_indices_set 400 ) 401 nontypical_center_points = PointList( 402 text_region_center_points[idx] for idx in nontypical_indices 403 ) 404 405 # Set main angle as the closest typical angle. 406 # Round 1: Set if the closest typical polygon is large enough. 407 _, np_kd_nbr_indices = kd_tree.query(nontypical_center_points.to_np_array()) 408 round2_nontypical_indices: List[int] = [] 409 for nontypical_idx, typical_indices_idx in zip( 410 nontypical_indices, 411 np_kd_nbr_indices[:, 0].tolist(), 412 ): 413 typical_idx = typical_indices[typical_indices_idx] 414 if cls.check_first_text_region_polygon_is_larger( 415 text_region_polygons=text_region_polygons, 416 short_side_lengths=short_side_lengths, 417 first_idx=typical_idx, 418 second_idx=nontypical_idx, 419 ): 420 main_angles[nontypical_idx] = main_angles[typical_idx] 421 else: 422 round2_nontypical_indices.append(nontypical_idx) 423 424 # Round 2: Searching the closest typical polygon that has larger area. 425 round3_nontypical_indices: List[int] = [] 426 if round2_nontypical_indices: 427 round2_nontypical_center_points = PointList( 428 text_region_center_points[idx] for idx in round2_nontypical_indices 429 ) 430 _, np_kd_nbr_indices = kd_tree.query( 431 round2_nontypical_center_points.to_np_array(), 432 k=len(typical_center_points), 433 ) 434 for nontypical_idx, typical_indices_indices in zip( 435 round2_nontypical_indices, 436 np_kd_nbr_indices.tolist(), 437 ): 438 hit_typical_idx = None 439 for typical_indices_idx in typical_indices_indices: 440 typical_idx = typical_indices[typical_indices_idx] 441 if cls.check_first_text_region_polygon_is_larger( 442 text_region_polygons=text_region_polygons, 443 short_side_lengths=short_side_lengths, 444 first_idx=typical_idx, 445 second_idx=nontypical_idx, 446 ): 447 hit_typical_idx = typical_idx 448 break 449 450 if hit_typical_idx is not None: 451 main_angles[nontypical_idx] = main_angles[hit_typical_idx] 452 else: 453 round3_nontypical_indices.append(nontypical_idx) 454 455 # Round 3: Last resort. Set to the median of typical angles. 456 if round3_nontypical_indices: 457 main_angles_median = statistics.median_low( 458 long_side_angles[typical_idx] for typical_idx in typical_indices 459 ) 460 for nontypical_idx in round3_nontypical_indices: 461 main_angles[nontypical_idx] = main_angles_median 462 463 # 3. Get angle for flattening. 464 flattening_rotate_angles: List[int] = [] 465 for main_angle in main_angles: 466 assert main_angle is not None 467 if main_angle <= 90: 468 # [270, 360). 469 flattening_rotate_angle = (360 - main_angle) % 360 470 else: 471 # [1, 90). 472 flattening_rotate_angle = 180 - main_angle 473 flattening_rotate_angles.append(flattening_rotate_angle) 474 475 return cast(List[int], main_angles), flattening_rotate_angles 476 477 @classmethod 478 def get_bounding_extended_text_region_masks( 479 cls, 480 shape: Tuple[int, int], 481 text_region_polygons: Sequence[Polygon], 482 dilated_text_region_polygons: Sequence[Polygon], 483 bounding_rectangular_polygons: Sequence[Polygon], 484 typical_indices: Sequence[int], 485 main_angles: Sequence[int], 486 ): 487 typical_indices_set = set(typical_indices) 488 489 text_mask = Mask.from_polygons(shape, text_region_polygons) 490 non_text_mask = text_mask.to_inverted_mask() 491 492 box = Box.from_shape(shape) 493 text_mask = text_mask.to_box_attached(box) 494 non_text_mask = non_text_mask.to_box_attached(box) 495 496 bounding_extended_text_region_masks: List[Mask] = [] 497 498 num_text_region_polygons = len(text_region_polygons) 499 for idx in range(num_text_region_polygons): 500 text_region_polygon = text_region_polygons[idx] 501 dilated_text_region_polygon = dilated_text_region_polygons[idx] 502 bounding_rectangular_polygon = bounding_rectangular_polygons[idx] 503 504 if typical_indices_set and idx not in typical_indices_set: 505 # Patch bounding rectangular polygon if is nontypical. 506 main_angle = main_angles[idx] 507 bounding_rectangular_polygon = \ 508 dilated_text_region_polygon.to_bounding_rectangular_polygon( 509 shape=shape, 510 angle=main_angle, 511 ) 512 513 # See the comment in Polygon.to_bounding_rectangular_polygon. 514 bounding_box = Box.from_boxes(( 515 dilated_text_region_polygon.bounding_box, 516 bounding_rectangular_polygon.bounding_box, 517 )) 518 519 # Fill other text region. 520 bounding_other_text_mask = \ 521 Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 522 # Copy from text mask. 523 bounding_rectangular_polygon.fill_mask(bounding_other_text_mask, text_mask) 524 # Use the original text region polygon to unset the current text mask. 525 text_region_polygon.fill_mask(bounding_other_text_mask, 0) 526 527 # Fill protentially dilated text region. 528 bounding_text_mask = \ 529 Mask.from_shapable(bounding_other_text_mask).to_box_attached(bounding_box) 530 # Use the protentially dilated text region polygon to set the current text mask. 531 dilated_text_region_polygon.fill_mask(bounding_text_mask, value=1) 532 533 del dilated_text_region_polygon 534 535 # Trim protentially dilated text region polygon by eliminating other text region. 536 bounding_trimmed_text_mask = Mask.from_masks( 537 bounding_box, 538 [ 539 # Includes the protentially dilated text region. 540 bounding_text_mask, 541 # But not includes any other text regions. 542 bounding_other_text_mask.to_inverted_mask(), 543 ], 544 ElementSetOperationMode.INTERSECT, 545 ) 546 547 # Extract non-text region. 548 bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask) 549 550 # Unionize trimmed text region and non-text region. 551 bounding_extended_text_region_mask = Mask.from_masks( 552 bounding_box, 553 [bounding_trimmed_text_mask, bounding_non_text_mask], 554 ) 555 556 bounding_extended_text_region_masks.append(bounding_extended_text_region_mask) 557 558 return bounding_extended_text_region_masks 559 560 @classmethod 561 def build_flattened_text_regions( 562 cls, 563 image: Image, 564 text_region_polygons: Sequence[Polygon], 565 bounding_extended_text_region_masks: Sequence[Mask], 566 typical_indices: Sequence[int], 567 flattening_rotate_angles: Sequence[int], 568 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 569 ): 570 typical_indices_set = set(typical_indices) 571 572 flattened_text_regions: List[FlattenedTextRegion] = [] 573 574 for idx, ( 575 text_region_polygon, 576 bounding_extended_text_region_mask, 577 flattening_rotate_angle, 578 ) in enumerate( 579 zip( 580 text_region_polygons, 581 bounding_extended_text_region_masks, 582 flattening_rotate_angles, 583 ) 584 ): 585 bounding_box = bounding_extended_text_region_mask.box 586 assert bounding_box 587 588 # Extract image. 589 text_region_image = bounding_extended_text_region_mask.extract_image(image) 590 591 # Shift char polygons. 592 relative_char_polygons = None 593 if grouped_char_polygons is not None: 594 char_polygons = grouped_char_polygons[idx] 595 relative_char_polygons = [ 596 char_polygon.to_relative_polygon( 597 origin_y=bounding_box.up, 598 origin_x=bounding_box.left, 599 ) for char_polygon in char_polygons 600 ] 601 602 # Rotate. 603 rotated_result = rotate.distort( 604 {'angle': flattening_rotate_angle}, 605 image=text_region_image, 606 mask=bounding_extended_text_region_mask, 607 polygons=relative_char_polygons, 608 ) 609 rotated_text_region_image = rotated_result.image 610 assert rotated_text_region_image 611 rotated_bounding_extended_text_region_mask = rotated_result.mask 612 assert rotated_bounding_extended_text_region_mask 613 # Could be None. 614 rotated_char_polygons = rotated_result.polygons 615 616 # Trim. 617 rotated_trimmed_box = rotated_bounding_extended_text_region_mask.to_external_box() 618 619 trimmed_text_region_image = rotated_text_region_image.to_cropped_image( 620 up=rotated_trimmed_box.up, 621 down=rotated_trimmed_box.down, 622 left=rotated_trimmed_box.left, 623 right=rotated_trimmed_box.right, 624 ) 625 626 trimmed_mask = rotated_trimmed_box.extract_mask( 627 rotated_bounding_extended_text_region_mask 628 ) 629 630 trimmed_char_polygons = None 631 if rotated_char_polygons: 632 trimmed_char_polygons = [ 633 rotated_char_polygon.to_relative_polygon( 634 origin_y=rotated_trimmed_box.up, 635 origin_x=rotated_trimmed_box.left, 636 ) for rotated_char_polygon in rotated_char_polygons 637 ] 638 639 flattened_text_regions.append( 640 FlattenedTextRegion( 641 is_typical=(idx in typical_indices_set), 642 text_region_polygon=text_region_polygon, 643 text_region_image=bounding_extended_text_region_mask.extract_image(image), 644 bounding_extended_text_region_mask=bounding_extended_text_region_mask, 645 flattening_rotate_angle=flattening_rotate_angle, 646 shape_before_trim=rotated_text_region_image.shape, 647 rotated_trimmed_box=rotated_trimmed_box, 648 shape_before_resize=trimmed_text_region_image.shape, 649 post_rotate_angle=0, 650 flattened_image=trimmed_text_region_image, 651 flattened_mask=trimmed_mask, 652 flattened_char_polygons=trimmed_char_polygons, 653 ) 654 ) 655 656 return flattened_text_regions 657 658 def __init__( 659 self, 660 typical_long_side_ratio_min: float, 661 text_region_polygon_dilate_ratio: float, 662 image: Image, 663 text_region_polygons: Sequence[Polygon], 664 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None, 665 is_training: bool = False, 666 ): 667 self.original_text_region_polygons = text_region_polygons 668 669 self.text_region_polygons = self.patch_text_region_polygons( 670 text_region_polygons=text_region_polygons, 671 grouped_char_polygons=grouped_char_polygons, 672 ) 673 674 force_no_dilation_flags = None 675 if is_training: 676 assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons) 677 force_no_dilation_flags = [] 678 for char_polygons in grouped_char_polygons: 679 force_no_dilation_flags.append(not char_polygons) 680 681 ( 682 self.dilated_text_region_polygons, 683 self.bounding_rectangular_polygons, 684 ) = self.get_dilated_and_bounding_rectangular_polygons( 685 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 686 shape=image.shape, 687 text_region_polygons=self.text_region_polygons, 688 force_no_dilation_flags=force_no_dilation_flags, 689 ) 690 691 ( 692 self.short_side_lengths, 693 self.long_side_ratios, 694 self.long_side_angles, 695 ) = self.analyze_bounding_rectangular_polygons(self.bounding_rectangular_polygons) 696 697 self.typical_indices = self.get_typical_indices( 698 typical_long_side_ratio_min=typical_long_side_ratio_min, 699 long_side_ratios=self.long_side_ratios, 700 ) 701 702 ( 703 self.main_angles, 704 self.flattening_rotate_angles, 705 ) = self.get_main_and_flattening_rotate_angles( 706 text_region_polygons=self.text_region_polygons, 707 typical_indices=self.typical_indices, 708 short_side_lengths=self.short_side_lengths, 709 long_side_angles=self.long_side_angles, 710 ) 711 712 self.bounding_extended_text_region_masks = self.get_bounding_extended_text_region_masks( 713 shape=image.shape, 714 text_region_polygons=self.text_region_polygons, 715 dilated_text_region_polygons=self.dilated_text_region_polygons, 716 bounding_rectangular_polygons=self.bounding_rectangular_polygons, 717 typical_indices=self.typical_indices, 718 main_angles=self.main_angles, 719 ) 720 721 self.flattened_text_regions = self.build_flattened_text_regions( 722 image=image, 723 # NOTE: need to use the original text region polygons for reversed opts. 724 text_region_polygons=self.original_text_region_polygons, 725 bounding_extended_text_region_masks=self.bounding_extended_text_region_masks, 726 typical_indices=self.typical_indices, 727 flattening_rotate_angles=self.flattening_rotate_angles, 728 grouped_char_polygons=grouped_char_polygons, 729 ) 730 731 732def build_background_image_for_stacking(height: int, width: int): 733 np_rgb_rows = [np.zeros((width, 3), dtype=np.uint8) for _ in range(3)] 734 rgb_tuples = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 735 736 for color_offset, np_row in enumerate(np_rgb_rows): 737 for color_idx in range(3): 738 color_tuple = rgb_tuples[(color_offset + color_idx) % 3] 739 np_row[color_idx::3] = color_tuple 740 741 np_image = np.zeros((height, width, 3), dtype=np.uint8) 742 for row_offset, np_row in enumerate(np_rgb_rows): 743 np_image[row_offset::3] = np_row 744 745 return Image(mat=np_image) 746 747 748def stack_flattened_text_regions( 749 page_pad: int, 750 flattened_text_regions_pad: int, 751 flattened_text_regions: Sequence[FlattenedTextRegion], 752): 753 page_double_pad = 2 * page_pad 754 flattened_text_regions_double_pad = 2 * flattened_text_regions_pad 755 756 rect_packer = RectPacker(rotation=False) 757 758 # Add box and bin. 759 # NOTE: Only one bin is added, that is, packing all text region into one image. 760 bin_width = 0 761 bin_height = 0 762 763 for ftr_idx, flattened_text_region in enumerate(flattened_text_regions): 764 rect_packer.add_rect( 765 width=flattened_text_region.width + flattened_text_regions_double_pad, 766 height=flattened_text_region.height + flattened_text_regions_double_pad, 767 rid=ftr_idx, 768 ) 769 770 bin_width = max(bin_width, flattened_text_region.width) 771 bin_height += flattened_text_region.height 772 773 bin_width += flattened_text_regions_double_pad 774 bin_height += flattened_text_regions_double_pad 775 776 rect_packer.add_bin(width=bin_width, height=bin_height) 777 778 # Pack boxes. 779 rect_packer.pack() # type: ignore 780 781 # Get packed boxes. 782 unordered_boxes: List[Box] = [] 783 ftr_indices: List[int] = [] 784 for bin_idx, x, y, width, height, ftr_idx in rect_packer.rect_list(): 785 assert bin_idx == 0 786 unordered_boxes.append(Box( 787 up=y, 788 down=y + height - 1, 789 left=x, 790 right=x + width - 1, 791 )) 792 ftr_indices.append(ftr_idx) 793 794 # Order boxes. 795 inverse_ftr_indices = [-1] * len(ftr_indices) 796 for inverse_ftr_idx, ftr_idx in enumerate(ftr_indices): 797 inverse_ftr_indices[ftr_idx] = inverse_ftr_idx 798 for inverse_ftr_idx in inverse_ftr_indices: 799 assert inverse_ftr_idx >= 0 800 padded_boxes = [unordered_boxes[inverse_ftr_idx] for inverse_ftr_idx in inverse_ftr_indices] 801 802 page_height = max(box.down for box in padded_boxes) + 1 + page_double_pad 803 page_width = max(box.right for box in padded_boxes) + 1 + page_double_pad 804 805 image = build_background_image_for_stacking(page_height, page_width) 806 active_mask = Mask.from_shapable(image) 807 text_region_boxes: List[Box] = [] 808 char_polygons: List[Polygon] = [] 809 char_polygon_text_region_box_indices: List[int] = [] 810 811 for padded_box, flattened_text_region in zip(padded_boxes, flattened_text_regions): 812 assert flattened_text_region.height + flattened_text_regions_double_pad \ 813 == padded_box.height 814 assert flattened_text_region.width + flattened_text_regions_double_pad \ 815 == padded_box.width 816 817 # Remove box padding. 818 up = padded_box.up + flattened_text_regions_pad + page_pad 819 left = padded_box.left + flattened_text_regions_pad + page_pad 820 821 text_region_box = Box( 822 up=up, 823 down=up + flattened_text_region.height - 1, 824 left=left, 825 right=left + flattened_text_region.width - 1, 826 ) 827 text_region_boxes.append(text_region_box) 828 text_region_box_idx = len(text_region_boxes) - 1 829 830 # Render. 831 text_region_box.fill_image( 832 image, 833 flattened_text_region.flattened_image, 834 image_mask=flattened_text_region.flattened_mask, 835 ) 836 text_region_box.fill_mask( 837 active_mask, 838 value=1, 839 mask_mask=flattened_text_region.flattened_mask, 840 ) 841 842 if flattened_text_region.flattened_char_polygons: 843 for char_polygon in flattened_text_region.flattened_char_polygons: 844 char_polygons.append(char_polygon.to_shifted_polygon( 845 offset_y=up, 846 offset_x=left, 847 )) 848 char_polygon_text_region_box_indices.append(text_region_box_idx) 849 850 return ( 851 image, 852 active_mask, 853 text_region_boxes, 854 char_polygons, 855 char_polygon_text_region_box_indices, 856 ) 857 858 859class PageTextRegionStep( 860 PipelineStep[ 861 PageTextRegionStepConfig, 862 PageTextRegionStepInput, 863 PageTextRegionStepOutput, 864 ] 865): # yapf: disable 866 867 @classmethod 868 def generate_precise_text_region_candidate_polygons( 869 cls, 870 precise_mask: Mask, 871 disconnected_text_region_mask: Mask, 872 ): 873 assert precise_mask.box and disconnected_text_region_mask.box 874 875 # Get the intersection. 876 intersected_box = Box( 877 up=max(precise_mask.box.up, disconnected_text_region_mask.box.up), 878 down=min(precise_mask.box.down, disconnected_text_region_mask.box.down), 879 left=max(precise_mask.box.left, disconnected_text_region_mask.box.left), 880 right=min(precise_mask.box.right, disconnected_text_region_mask.box.right), 881 ) 882 assert intersected_box.up <= intersected_box.down 883 assert intersected_box.left <= intersected_box.right 884 885 precise_mask = intersected_box.extract_mask(precise_mask) 886 disconnected_text_region_mask = intersected_box.extract_mask(disconnected_text_region_mask) 887 888 # Apply mask bitwise-and operation. 889 intersected_mask = Mask( 890 mat=(disconnected_text_region_mask.mat & precise_mask.mat).astype(np.uint8) 891 ) 892 intersected_mask = intersected_mask.to_box_attached(intersected_box) 893 894 # NOTE: 895 # 1. Could extract more than one polygons. 896 # 2. Some polygons are in border and should be removed later. 897 return intersected_mask.to_disconnected_polygons() 898 899 @classmethod 900 def strtree_query_intersected_polygons( 901 cls, 902 strtree: STRtree, 903 anchor_polygons: Sequence[Polygon], 904 candidate_polygon: Polygon, 905 ): 906 candidate_shapely_polygon = candidate_polygon.to_shapely_polygon() 907 candidate_mask = candidate_polygon.mask 908 909 for anchor_idx in sorted(strtree.query(candidate_shapely_polygon)): 910 anchor_polygon = anchor_polygons[anchor_idx] 911 anchor_mask = anchor_polygon.mask 912 913 intersected_ratio = calculate_boxed_masks_intersected_ratio( 914 anchor_mask=anchor_mask, 915 candidate_mask=candidate_mask, 916 use_candidate_as_base=True, 917 ) 918 919 yield ( 920 anchor_idx, 921 anchor_polygon, 922 anchor_mask, 923 candidate_mask, 924 intersected_ratio, 925 ) 926 927 def sample_page_non_text_region_polygons( 928 self, 929 page_non_text_region_polygons: Sequence[Polygon], 930 num_page_text_region_infos: int, 931 rng: RandomGenerator, 932 ): 933 negative_ratio = self.config.negative_text_region_ratio 934 num_page_non_text_region_polygons = round( 935 negative_ratio * num_page_text_region_infos / (1 - negative_ratio) 936 ) 937 return rng_choice_with_size( 938 rng, 939 page_non_text_region_polygons, 940 size=min( 941 num_page_non_text_region_polygons, 942 len(page_non_text_region_polygons), 943 ), 944 replace=False, 945 ) 946 947 def build_flattened_text_regions( 948 self, 949 page_image: Image, 950 page_text_region_infos: Sequence[PageTextRegionInfo], 951 page_non_text_region_polygons: Sequence[Polygon], 952 rng: RandomGenerator, 953 ): 954 text_region_polygon_dilate_ratio = float( 955 rng.uniform( 956 self.config.text_region_flattener_text_region_polygon_dilate_ratio_min, 957 self.config.text_region_flattener_text_region_polygon_dilate_ratio_max, 958 ) 959 ) 960 typical_long_side_ratio_min = \ 961 self.config.text_region_flattener_typical_long_side_ratio_min 962 963 text_region_polygons: List[Polygon] = [] 964 grouped_char_polygons: List[Sequence[Polygon]] = [] 965 for page_text_region_info in page_text_region_infos: 966 text_region_polygons.append(page_text_region_info.precise_text_region_polygon) 967 grouped_char_polygons.append(page_text_region_info.char_polygons) 968 969 # Inject nagative regions. 970 for page_non_text_region_polygon in page_non_text_region_polygons: 971 # NOTE: Don't drop any text region here, otherwise will introduce labeling confusion, 972 # since dropped text region will be considered as non-text region. 973 text_region_polygons.append(page_non_text_region_polygon) 974 grouped_char_polygons.append(tuple()) 975 976 text_region_flattener = TextRegionFlattener( 977 typical_long_side_ratio_min=typical_long_side_ratio_min, 978 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 979 image=page_image, 980 text_region_polygons=text_region_polygons, 981 grouped_char_polygons=grouped_char_polygons, 982 is_training=True, 983 ) 984 985 # Resize positive ftr. 986 positive_flattened_text_regions: List[FlattenedTextRegion] = [] 987 # For negative sampling. 988 positive_reference_heights: List[float] = [] 989 positive_reference_widths: List[float] = [] 990 num_negative_flattened_text_regions = 0 991 992 for flattened_text_region in text_region_flattener.flattened_text_regions: 993 if not flattened_text_region.flattened_char_polygons: 994 num_negative_flattened_text_regions += 1 995 continue 996 997 if len(flattened_text_region.flattened_char_polygons) == 1 \ 998 and rng.random() < self.config.prob_drop_single_char_page_text_region_info: 999 # Ignore some single-char text region for reducing label confusion. 1000 continue 1001 1002 char_height_median = flattened_text_region.get_char_height_meidan() 1003 1004 text_region_resize_char_height_median = int( 1005 rng.integers( 1006 self.config.text_region_resize_char_height_median_min, 1007 self.config.text_region_resize_char_height_median_max + 1, 1008 ) 1009 ) 1010 scale = text_region_resize_char_height_median / char_height_median 1011 1012 height, width = flattened_text_region.shape 1013 resized_height = round(height * scale) 1014 resized_width = round(width * scale) 1015 1016 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1017 resized_height=resized_height, 1018 resized_width=resized_width, 1019 ) 1020 1021 positive_reference_heights.append(resized_height) 1022 positive_reference_widths.append(resized_width) 1023 1024 # Post rotate. 1025 post_rotate_angle = 0 1026 if flattened_text_region.is_typical: 1027 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1028 # Upside down only. 1029 post_rotate_angle = 180 1030 else: 1031 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1032 # 3-way rotate. 1033 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1034 1035 if post_rotate_angle != 0: 1036 flattened_text_region = \ 1037 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1038 1039 positive_flattened_text_regions.append(flattened_text_region) 1040 1041 # Resize negative ftr. 1042 negative_reference_heights = list( 1043 rng_choice_with_size( 1044 rng, 1045 positive_reference_heights, 1046 size=num_negative_flattened_text_regions, 1047 replace=(num_negative_flattened_text_regions > len(positive_reference_heights)), 1048 ) 1049 ) 1050 1051 negative_height_max = max(positive_reference_heights) 1052 negative_width_max = max(positive_reference_widths) 1053 1054 negative_flattened_text_regions: List[FlattenedTextRegion] = [] 1055 1056 for flattened_text_region in text_region_flattener.flattened_text_regions: 1057 if flattened_text_region.flattened_char_polygons: 1058 continue 1059 1060 reference_height = negative_reference_heights.pop() 1061 scale = reference_height / flattened_text_region.height 1062 1063 height, width = flattened_text_region.shape 1064 resized_height = round(height * scale) 1065 resized_width = round(width * scale) 1066 1067 # Remove negative region that is too large. 1068 if resized_height > negative_height_max or resized_width > negative_width_max: 1069 continue 1070 1071 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1072 resized_height=resized_height, 1073 resized_width=resized_width, 1074 ) 1075 1076 # Post rotate. 1077 post_rotate_angle = 0 1078 if flattened_text_region.is_typical: 1079 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1080 # Upside down only. 1081 post_rotate_angle = 180 1082 else: 1083 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1084 # 3-way rotate. 1085 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1086 1087 if post_rotate_angle != 0: 1088 flattened_text_region = \ 1089 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1090 1091 negative_flattened_text_regions.append(flattened_text_region) 1092 1093 flattened_text_regions = ( 1094 *positive_flattened_text_regions, 1095 *negative_flattened_text_regions, 1096 ) 1097 return flattened_text_regions 1098 1099 def run(self, input: PageTextRegionStepInput, rng: RandomGenerator): 1100 page_distortion_step_output = input.page_distortion_step_output 1101 page_image = page_distortion_step_output.page_image 1102 page_char_polygon_collection = page_distortion_step_output.page_char_polygon_collection 1103 page_disconnected_text_region_collection = \ 1104 page_distortion_step_output.page_disconnected_text_region_collection 1105 page_non_text_region_collection = \ 1106 page_distortion_step_output.page_non_text_region_collection 1107 1108 page_resizing_step_output = input.page_resizing_step_output 1109 page_resized_text_line_mask = page_resizing_step_output.page_text_line_mask 1110 1111 debug = None 1112 if self.config.enable_debug: 1113 debug = PageTextRegionStepDebug() 1114 1115 # Build R-tree to track text regions. 1116 disconnected_text_region_polygons: List[Polygon] = [] 1117 disconnected_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1118 for polygon in page_disconnected_text_region_collection.to_polygons(): 1119 disconnected_text_region_polygons.append(polygon) 1120 shapely_polygon = polygon.to_shapely_polygon() 1121 disconnected_text_region_shapely_polygons.append(shapely_polygon) 1122 1123 disconnected_text_region_tree = STRtree(disconnected_text_region_shapely_polygons) 1124 1125 # Get the precise text regions. 1126 precise_text_region_candidate_polygons: List[Polygon] = [] 1127 for resized_precise_polygon in page_resized_text_line_mask.to_disconnected_polygons(): 1128 # Resize back to the shape after distortion. 1129 precise_polygon = resized_precise_polygon.to_conducted_resized_polygon( 1130 page_resized_text_line_mask, 1131 resized_height=page_image.height, 1132 resized_width=page_image.width, 1133 ) 1134 1135 # Find and extract intersected text region. 1136 # NOTE: One precise_polygon could be overlapped with 1137 # more than one disconnected_text_region_polygon! 1138 for _, _, disconnected_text_region_mask, precise_mask, _ in \ 1139 self.strtree_query_intersected_polygons( 1140 strtree=disconnected_text_region_tree, 1141 anchor_polygons=disconnected_text_region_polygons, 1142 candidate_polygon=precise_polygon, 1143 ): 1144 precise_text_region_candidate_polygons.extend( 1145 self.generate_precise_text_region_candidate_polygons( 1146 precise_mask=precise_mask, 1147 disconnected_text_region_mask=disconnected_text_region_mask, 1148 ) 1149 ) 1150 1151 if debug: 1152 debug.page_image = page_image 1153 debug.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons 1154 1155 # Help gc. 1156 del disconnected_text_region_polygons 1157 del disconnected_text_region_shapely_polygons 1158 del disconnected_text_region_tree 1159 1160 # Bind char-level polygon to precise text region. 1161 precise_text_region_polygons: List[Polygon] = [] 1162 precise_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1163 1164 for polygon in precise_text_region_candidate_polygons: 1165 precise_text_region_polygons.append(polygon) 1166 shapely_polygon = polygon.to_shapely_polygon() 1167 precise_text_region_shapely_polygons.append(shapely_polygon) 1168 1169 precise_text_region_tree = STRtree(precise_text_region_shapely_polygons) 1170 1171 if not self.config.use_adjusted_char_polygons: 1172 selected_char_polygons = page_char_polygon_collection.char_polygons 1173 else: 1174 selected_char_polygons = page_char_polygon_collection.adjusted_char_polygons 1175 1176 ptrp_idx_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list) 1177 1178 for char_polygon in selected_char_polygons: 1179 best_precise_text_region_polygon_idx = None 1180 intersected_ratio_max = 0 1181 1182 for ( 1183 precise_text_region_polygon_idx, 1184 _, 1185 _, 1186 _, 1187 intersected_ratio, 1188 ) in self.strtree_query_intersected_polygons( 1189 strtree=precise_text_region_tree, 1190 anchor_polygons=precise_text_region_polygons, 1191 candidate_polygon=char_polygon, 1192 ): 1193 if intersected_ratio > intersected_ratio_max: 1194 intersected_ratio_max = intersected_ratio 1195 best_precise_text_region_polygon_idx = precise_text_region_polygon_idx 1196 1197 if best_precise_text_region_polygon_idx is not None: 1198 ptrp_idx_to_char_polygons[best_precise_text_region_polygon_idx].append(char_polygon) 1199 else: 1200 # NOTE: Text line with only a small char (i.e. delimiter) could enter this branch. 1201 # In such case, the text line bounding box is smaller than the char polygon, since 1202 # the leading/trailing char paddings are ignored during text line rendering. 1203 # It's acceptable for now since: 1) this case happens rarely, 2) and it won't 1204 # introduce labeling noise. 1205 logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}') 1206 1207 page_text_region_infos: List[PageTextRegionInfo] = [] 1208 for ptrp_idx, precise_text_region_polygon in enumerate(precise_text_region_polygons): 1209 if ptrp_idx not in ptrp_idx_to_char_polygons: 1210 continue 1211 page_text_region_infos.append( 1212 PageTextRegionInfo( 1213 precise_text_region_polygon=precise_text_region_polygon, 1214 char_polygons=ptrp_idx_to_char_polygons[ptrp_idx], 1215 ) 1216 ) 1217 1218 # Help gc. 1219 del precise_text_region_polygons 1220 del precise_text_region_shapely_polygons 1221 del precise_text_region_tree 1222 1223 if debug: 1224 debug.page_text_region_infos = page_text_region_infos 1225 1226 # Negative sampling. 1227 page_non_text_region_polygons = self.sample_page_non_text_region_polygons( 1228 page_non_text_region_polygons=tuple(page_non_text_region_collection.to_polygons()), 1229 num_page_text_region_infos=len(page_text_region_infos), 1230 rng=rng, 1231 ) 1232 1233 flattened_text_regions = self.build_flattened_text_regions( 1234 page_image=page_image, 1235 page_text_region_infos=page_text_region_infos, 1236 page_non_text_region_polygons=page_non_text_region_polygons, 1237 rng=rng, 1238 ) 1239 if debug: 1240 debug.flattened_text_regions = flattened_text_regions 1241 1242 # Stack text regions. 1243 ( 1244 image, 1245 active_mask, 1246 text_region_boxes, 1247 char_polygons, 1248 char_polygon_text_region_box_indices, 1249 ) = stack_flattened_text_regions( 1250 page_pad=0, 1251 flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad, 1252 flattened_text_regions=flattened_text_regions, 1253 ) 1254 1255 text_region_polygons = [ 1256 text_region_box.to_polygon() for text_region_box in text_region_boxes 1257 ] 1258 1259 # Post uniform rotation. 1260 shape_before_rotate = image.shape 1261 rotate_angle = 0 1262 1263 if rng.random() < self.config.prob_post_rotate_90_angle: 1264 rotate_angle = 90 1265 1266 if rng.random() < self.config.prob_post_rotate_random_angle: 1267 rotate_angle += int( 1268 rng.integers( 1269 self.config.post_rotate_random_angle_min, 1270 self.config.post_rotate_random_angle_max + 1, 1271 ) 1272 ) 1273 1274 if rotate_angle != 0: 1275 # For unpacking. 1276 num_char_polygons = len(char_polygons) 1277 rotated_result = rotate.distort( 1278 {'angle': rotate_angle}, 1279 image=image, 1280 mask=active_mask, 1281 polygons=(*char_polygons, *text_region_polygons), 1282 ) 1283 assert rotated_result.image and rotated_result.mask and rotated_result.polygons 1284 image = rotated_result.image 1285 active_mask = rotated_result.mask 1286 char_polygons = rotated_result.polygons[:num_char_polygons] 1287 text_region_polygons = rotated_result.polygons[num_char_polygons:] 1288 1289 return PageTextRegionStepOutput( 1290 page_image=image, 1291 page_active_mask=active_mask, 1292 page_char_polygons=char_polygons, 1293 page_text_region_polygons=text_region_polygons, 1294 page_char_polygon_text_region_polygon_indices=char_polygon_text_region_box_indices, 1295 shape_before_rotate=shape_before_rotate, 1296 rotate_angle=rotate_angle, 1297 debug=debug, 1298 ) 1299 1300 1301page_text_region_step_factory = PipelineStepFactory(PageTextRegionStep)
41class PageTextRegionStepConfig: 42 use_adjusted_char_polygons: bool = False 43 prob_drop_single_char_page_text_region_info: float = 0.5 44 text_region_flattener_typical_long_side_ratio_min: float = 3.0 45 text_region_flattener_text_region_polygon_dilate_ratio_min: float = 0.85 46 text_region_flattener_text_region_polygon_dilate_ratio_max: float = 1.0 47 text_region_resize_char_height_median_min: int = 32 48 text_region_resize_char_height_median_max: int = 46 49 prob_text_region_typical_post_rotate: float = 0.2 50 prob_text_region_untypical_post_rotate: float = 0.2 51 negative_text_region_ratio: float = 0.1 52 prob_negative_text_region_post_rotate: float = 0.2 53 stack_flattened_text_regions_pad: int = 2 54 prob_post_rotate_90_angle: float = 0.5 55 prob_post_rotate_random_angle: float = 0.0 56 post_rotate_random_angle_min: int = -5 57 post_rotate_random_angle_max: int = 5 58 enable_debug: bool = False
2def __init__(self, use_adjusted_char_polygons=attr_dict['use_adjusted_char_polygons'].default, prob_drop_single_char_page_text_region_info=attr_dict['prob_drop_single_char_page_text_region_info'].default, text_region_flattener_typical_long_side_ratio_min=attr_dict['text_region_flattener_typical_long_side_ratio_min'].default, text_region_flattener_text_region_polygon_dilate_ratio_min=attr_dict['text_region_flattener_text_region_polygon_dilate_ratio_min'].default, text_region_flattener_text_region_polygon_dilate_ratio_max=attr_dict['text_region_flattener_text_region_polygon_dilate_ratio_max'].default, text_region_resize_char_height_median_min=attr_dict['text_region_resize_char_height_median_min'].default, text_region_resize_char_height_median_max=attr_dict['text_region_resize_char_height_median_max'].default, prob_text_region_typical_post_rotate=attr_dict['prob_text_region_typical_post_rotate'].default, prob_text_region_untypical_post_rotate=attr_dict['prob_text_region_untypical_post_rotate'].default, negative_text_region_ratio=attr_dict['negative_text_region_ratio'].default, prob_negative_text_region_post_rotate=attr_dict['prob_negative_text_region_post_rotate'].default, stack_flattened_text_regions_pad=attr_dict['stack_flattened_text_regions_pad'].default, prob_post_rotate_90_angle=attr_dict['prob_post_rotate_90_angle'].default, prob_post_rotate_random_angle=attr_dict['prob_post_rotate_random_angle'].default, post_rotate_random_angle_min=attr_dict['post_rotate_random_angle_min'].default, post_rotate_random_angle_max=attr_dict['post_rotate_random_angle_max'].default, enable_debug=attr_dict['enable_debug'].default): 3 self.use_adjusted_char_polygons = use_adjusted_char_polygons 4 self.prob_drop_single_char_page_text_region_info = prob_drop_single_char_page_text_region_info 5 self.text_region_flattener_typical_long_side_ratio_min = text_region_flattener_typical_long_side_ratio_min 6 self.text_region_flattener_text_region_polygon_dilate_ratio_min = text_region_flattener_text_region_polygon_dilate_ratio_min 7 self.text_region_flattener_text_region_polygon_dilate_ratio_max = text_region_flattener_text_region_polygon_dilate_ratio_max 8 self.text_region_resize_char_height_median_min = text_region_resize_char_height_median_min 9 self.text_region_resize_char_height_median_max = text_region_resize_char_height_median_max 10 self.prob_text_region_typical_post_rotate = prob_text_region_typical_post_rotate 11 self.prob_text_region_untypical_post_rotate = prob_text_region_untypical_post_rotate 12 self.negative_text_region_ratio = negative_text_region_ratio 13 self.prob_negative_text_region_post_rotate = prob_negative_text_region_post_rotate 14 self.stack_flattened_text_regions_pad = stack_flattened_text_regions_pad 15 self.prob_post_rotate_90_angle = prob_post_rotate_90_angle 16 self.prob_post_rotate_random_angle = prob_post_rotate_random_angle 17 self.post_rotate_random_angle_min = post_rotate_random_angle_min 18 self.post_rotate_random_angle_max = post_rotate_random_angle_max 19 self.enable_debug = enable_debug
Method generated by attrs for class PageTextRegionStepConfig.
62class PageTextRegionStepInput: 63 page_distortion_step_output: PageDistortionStepOutput 64 page_resizing_step_output: PageResizingStepOutput
2def __init__(self, page_distortion_step_output, page_resizing_step_output): 3 self.page_distortion_step_output = page_distortion_step_output 4 self.page_resizing_step_output = page_resizing_step_output
Method generated by attrs for class PageTextRegionStepInput.
68class PageTextRegionInfo: 69 precise_text_region_polygon: Polygon 70 char_polygons: Sequence[Polygon]
2def __init__(self, precise_text_region_polygon, char_polygons): 3 self.precise_text_region_polygon = precise_text_region_polygon 4 self.char_polygons = char_polygons
Method generated by attrs for class PageTextRegionInfo.
74class FlattenedTextRegion: 75 is_typical: bool 76 text_region_polygon: Polygon 77 text_region_image: Image 78 bounding_extended_text_region_mask: Mask 79 flattening_rotate_angle: int 80 shape_before_trim: Tuple[int, int] 81 rotated_trimmed_box: Box 82 shape_before_resize: Tuple[int, int] 83 post_rotate_angle: int 84 flattened_image: Image 85 flattened_mask: Mask 86 flattened_char_polygons: Optional[Sequence[Polygon]] 87 88 @property 89 def shape(self): 90 return self.flattened_image.shape 91 92 @property 93 def height(self): 94 return self.flattened_image.height 95 96 @property 97 def width(self): 98 return self.flattened_image.width 99 100 @property 101 def area(self): 102 return self.flattened_image.area 103 104 def get_char_height_meidan(self): 105 assert self.flattened_char_polygons 106 return statistics.median( 107 char_polygon.get_rectangular_height() for char_polygon in self.flattened_char_polygons 108 ) 109 110 def to_resized_flattened_text_region( 111 self, 112 resized_height: Optional[int] = None, 113 resized_width: Optional[int] = None, 114 ): 115 resized_flattened_image = self.flattened_image.to_resized_image( 116 resized_height=resized_height, 117 resized_width=resized_width, 118 ) 119 120 resized_flattened_mask = self.flattened_mask.to_resized_mask( 121 resized_height=resized_height, 122 resized_width=resized_width, 123 ) 124 125 resized_flattened_char_polygons = None 126 if self.flattened_char_polygons is not None: 127 resized_flattened_char_polygons = [ 128 flattened_char_polygon.to_conducted_resized_polygon( 129 self.shape, 130 resized_height=resized_height, 131 resized_width=resized_width, 132 ) for flattened_char_polygon in self.flattened_char_polygons 133 ] 134 135 return attrs.evolve( 136 self, 137 flattened_image=resized_flattened_image, 138 flattened_mask=resized_flattened_mask, 139 flattened_char_polygons=resized_flattened_char_polygons, 140 ) 141 142 def to_post_rotated_flattened_text_region( 143 self, 144 post_rotate_angle: int, 145 ): 146 assert self.post_rotate_angle == 0 147 148 # NOTE: No need to trim. 149 rotated_result = rotate.distort( 150 {'angle': post_rotate_angle}, 151 image=self.flattened_image, 152 mask=self.flattened_mask, 153 polygons=self.flattened_char_polygons, 154 ) 155 rotated_flattened_image = rotated_result.image 156 assert rotated_flattened_image 157 rotated_flattened_mask = rotated_result.mask 158 assert rotated_flattened_mask 159 rotated_flattened_char_polygons = rotated_result.polygons 160 161 return attrs.evolve( 162 self, 163 post_rotate_angle=post_rotate_angle, 164 flattened_image=rotated_flattened_image, 165 flattened_mask=rotated_flattened_mask, 166 flattened_char_polygons=rotated_flattened_char_polygons, 167 )
2def __init__(self, is_typical, text_region_polygon, text_region_image, bounding_extended_text_region_mask, flattening_rotate_angle, shape_before_trim, rotated_trimmed_box, shape_before_resize, post_rotate_angle, flattened_image, flattened_mask, flattened_char_polygons): 3 self.is_typical = is_typical 4 self.text_region_polygon = text_region_polygon 5 self.text_region_image = text_region_image 6 self.bounding_extended_text_region_mask = bounding_extended_text_region_mask 7 self.flattening_rotate_angle = flattening_rotate_angle 8 self.shape_before_trim = shape_before_trim 9 self.rotated_trimmed_box = rotated_trimmed_box 10 self.shape_before_resize = shape_before_resize 11 self.post_rotate_angle = post_rotate_angle 12 self.flattened_image = flattened_image 13 self.flattened_mask = flattened_mask 14 self.flattened_char_polygons = flattened_char_polygons
Method generated by attrs for class FlattenedTextRegion.
110 def to_resized_flattened_text_region( 111 self, 112 resized_height: Optional[int] = None, 113 resized_width: Optional[int] = None, 114 ): 115 resized_flattened_image = self.flattened_image.to_resized_image( 116 resized_height=resized_height, 117 resized_width=resized_width, 118 ) 119 120 resized_flattened_mask = self.flattened_mask.to_resized_mask( 121 resized_height=resized_height, 122 resized_width=resized_width, 123 ) 124 125 resized_flattened_char_polygons = None 126 if self.flattened_char_polygons is not None: 127 resized_flattened_char_polygons = [ 128 flattened_char_polygon.to_conducted_resized_polygon( 129 self.shape, 130 resized_height=resized_height, 131 resized_width=resized_width, 132 ) for flattened_char_polygon in self.flattened_char_polygons 133 ] 134 135 return attrs.evolve( 136 self, 137 flattened_image=resized_flattened_image, 138 flattened_mask=resized_flattened_mask, 139 flattened_char_polygons=resized_flattened_char_polygons, 140 )
142 def to_post_rotated_flattened_text_region( 143 self, 144 post_rotate_angle: int, 145 ): 146 assert self.post_rotate_angle == 0 147 148 # NOTE: No need to trim. 149 rotated_result = rotate.distort( 150 {'angle': post_rotate_angle}, 151 image=self.flattened_image, 152 mask=self.flattened_mask, 153 polygons=self.flattened_char_polygons, 154 ) 155 rotated_flattened_image = rotated_result.image 156 assert rotated_flattened_image 157 rotated_flattened_mask = rotated_result.mask 158 assert rotated_flattened_mask 159 rotated_flattened_char_polygons = rotated_result.polygons 160 161 return attrs.evolve( 162 self, 163 post_rotate_angle=post_rotate_angle, 164 flattened_image=rotated_flattened_image, 165 flattened_mask=rotated_flattened_mask, 166 flattened_char_polygons=rotated_flattened_char_polygons, 167 )
171class PageTextRegionStepDebug: 172 page_image: Image = attrs.field(default=None) 173 precise_text_region_candidate_polygons: Sequence[Polygon] = attrs.field(default=None) 174 page_text_region_infos: Sequence[PageTextRegionInfo] = attrs.field(default=None) 175 flattened_text_regions: Sequence[FlattenedTextRegion] = attrs.field(default=None)
2def __init__(self, page_image=attr_dict['page_image'].default, precise_text_region_candidate_polygons=attr_dict['precise_text_region_candidate_polygons'].default, page_text_region_infos=attr_dict['page_text_region_infos'].default, flattened_text_regions=attr_dict['flattened_text_regions'].default): 3 self.page_image = page_image 4 self.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons 5 self.page_text_region_infos = page_text_region_infos 6 self.flattened_text_regions = flattened_text_regions
Method generated by attrs for class PageTextRegionStepDebug.
179class PageTextRegionStepOutput: 180 page_image: Image 181 page_active_mask: Mask 182 page_char_polygons: Sequence[Polygon] 183 page_text_region_polygons: Sequence[Polygon] 184 page_char_polygon_text_region_polygon_indices: Sequence[int] 185 shape_before_rotate: Tuple[int, int] 186 rotate_angle: int 187 debug: Optional[PageTextRegionStepDebug]
2def __init__(self, page_image, page_active_mask, page_char_polygons, page_text_region_polygons, page_char_polygon_text_region_polygon_indices, shape_before_rotate, rotate_angle, debug): 3 self.page_image = page_image 4 self.page_active_mask = page_active_mask 5 self.page_char_polygons = page_char_polygons 6 self.page_text_region_polygons = page_text_region_polygons 7 self.page_char_polygon_text_region_polygon_indices = page_char_polygon_text_region_polygon_indices 8 self.shape_before_rotate = shape_before_rotate 9 self.rotate_angle = rotate_angle 10 self.debug = debug
Method generated by attrs for class PageTextRegionStepOutput.
190def calculate_boxed_masks_intersected_ratio( 191 anchor_mask: Mask, 192 candidate_mask: Mask, 193 use_candidate_as_base: bool = False, 194): 195 anchor_box = anchor_mask.box 196 assert anchor_box 197 198 candidate_box = candidate_mask.box 199 assert candidate_box 200 201 # Calculate intersection. 202 up = max(anchor_box.up, candidate_box.up) 203 down = min(anchor_box.down, candidate_box.down) 204 left = max(anchor_box.left, candidate_box.left) 205 right = min(anchor_box.right, candidate_box.right) 206 207 if up > down or left > right: 208 return 0.0 209 210 np_intersected_anchor_mask = anchor_mask.mat[ 211 up - anchor_box.up:down - anchor_box.up + 1, 212 left - anchor_box.left:right - anchor_box.left + 1, 213 ] # yapf: disable 214 np_intersected_candidate_mask = candidate_mask.mat[ 215 up - candidate_box.up:down - candidate_box.up + 1, 216 left - candidate_box.left:right - candidate_box.left + 1, 217 ] # yapf: disable 218 np_intersected_mask = np_intersected_anchor_mask & np_intersected_candidate_mask 219 intersected_area = int(np_intersected_mask.sum()) 220 221 if use_candidate_as_base: 222 base_area = int(candidate_mask.np_mask.sum()) 223 else: 224 base_area = ( 225 int(anchor_mask.np_mask.sum()) + int(candidate_mask.np_mask.sum()) - intersected_area 226 ) 227 228 return intersected_area / base_area
231class TextRegionFlattener: 232 233 @classmethod 234 def patch_text_region_polygons( 235 cls, 236 text_region_polygons: Sequence[Polygon], 237 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 238 ): 239 if grouped_char_polygons is None: 240 return text_region_polygons 241 242 assert len(text_region_polygons) == len(grouped_char_polygons) 243 244 patched_text_region_polygons: List[Polygon] = [] 245 for text_region_polygon, char_polygons in zip(text_region_polygons, grouped_char_polygons): 246 # Need to make sure all char polygons are included. 247 unionized_polygons = [text_region_polygon] 248 unionized_polygons.extend(char_polygons) 249 250 bounding_box = Box.from_boxes((polygon.bounding_box for polygon in unionized_polygons)) 251 mask = Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 252 for polygon in unionized_polygons: 253 polygon.fill_mask(mask) 254 255 patched_text_region_polygons.append(mask.to_external_polygon()) 256 257 return patched_text_region_polygons 258 259 @classmethod 260 def get_dilated_and_bounding_rectangular_polygons( 261 cls, 262 text_region_polygon_dilate_ratio: float, 263 shape: Tuple[int, int], 264 text_region_polygons: Sequence[Polygon], 265 force_no_dilation_flags: Optional[Sequence[bool]] = None, 266 ): 267 dilated_text_region_polygons: List[Polygon] = [] 268 bounding_rectangular_polygons: List[Polygon] = [] 269 270 if force_no_dilation_flags is None: 271 force_no_dilation_flags_iter = itertools.repeat(False) 272 else: 273 assert len(force_no_dilation_flags) == len(text_region_polygons) 274 force_no_dilation_flags_iter = force_no_dilation_flags 275 276 for text_region_polygon, force_no_dilation_flag in zip( 277 text_region_polygons, force_no_dilation_flags_iter 278 ): 279 280 if not force_no_dilation_flag: 281 # Dilate. 282 text_region_polygon = text_region_polygon.to_dilated_polygon( 283 ratio=text_region_polygon_dilate_ratio, 284 ) 285 text_region_polygon = text_region_polygon.to_clipped_polygon(shape) 286 287 dilated_text_region_polygons.append(text_region_polygon) 288 bounding_rectangular_polygons.append( 289 text_region_polygon.to_bounding_rectangular_polygon(shape) 290 ) 291 292 return dilated_text_region_polygons, bounding_rectangular_polygons 293 294 @classmethod 295 def analyze_bounding_rectangular_polygons( 296 cls, 297 bounding_rectangular_polygons: Sequence[Polygon], 298 ): 299 short_side_lengths: List[float] = [] 300 long_side_ratios: List[float] = [] 301 long_side_angles: List[int] = [] 302 303 for polygon in bounding_rectangular_polygons: 304 # Get reference line. 305 point0, point1, _, point3 = polygon.points 306 side0_length = math.hypot( 307 point0.smooth_y - point1.smooth_y, 308 point0.smooth_x - point1.smooth_x, 309 ) 310 side1_length = math.hypot( 311 point0.smooth_y - point3.smooth_y, 312 point0.smooth_x - point3.smooth_x, 313 ) 314 315 # Get the short side length. 316 short_side_lengths.append(min(side0_length, side1_length)) 317 318 long_side_ratios.append( 319 max(side0_length, side1_length) / min(side0_length, side1_length) 320 ) 321 322 point_a = point0 323 if side0_length > side1_length: 324 # Reference line (p0 -> p1). 325 point_b = point1 326 else: 327 # Reference line (p0 -> p3). 328 point_b = point3 329 330 # Get the angle of reference line, in [0, 180) degree. 331 np_theta = np.arctan2( 332 point_a.smooth_y - point_b.smooth_y, 333 point_a.smooth_x - point_b.smooth_x, 334 ) 335 np_theta = np_theta % np.pi 336 long_side_angle = round(np_theta / np.pi * 180) % 180 337 long_side_angles.append(long_side_angle) 338 339 return short_side_lengths, long_side_ratios, long_side_angles 340 341 @classmethod 342 def get_typical_indices( 343 cls, 344 typical_long_side_ratio_min: float, 345 long_side_ratios: Sequence[float], 346 ): 347 return tuple( 348 idx for idx, long_side_ratio in enumerate(long_side_ratios) 349 if long_side_ratio >= typical_long_side_ratio_min 350 ) 351 352 @classmethod 353 def check_first_text_region_polygon_is_larger( 354 cls, 355 text_region_polygons: Sequence[Polygon], 356 short_side_lengths: Sequence[float], 357 first_idx: int, 358 second_idx: int, 359 ): 360 first_text_region_polygon = text_region_polygons[first_idx] 361 second_text_region_polygon = text_region_polygons[second_idx] 362 363 # The short side indicates the text line height. 364 first_short_side_length = short_side_lengths[first_idx] 365 second_short_side_length = short_side_lengths[second_idx] 366 367 return ( 368 first_text_region_polygon.area >= second_text_region_polygon.area 369 and first_short_side_length >= second_short_side_length 370 ) 371 372 @classmethod 373 def get_main_and_flattening_rotate_angles( 374 cls, 375 text_region_polygons: Sequence[Polygon], 376 typical_indices: Sequence[int], 377 short_side_lengths: Sequence[float], 378 long_side_angles: Sequence[int], 379 ): 380 typical_indices_set = set(typical_indices) 381 text_region_center_points = [ 382 text_region_polygon.get_center_point() for text_region_polygon in text_region_polygons 383 ] 384 385 main_angles: List[Optional[int]] = [None] * len(long_side_angles) 386 387 # 1. For typical indices, or if no typical indices. 388 for idx, long_side_angle in enumerate(long_side_angles): 389 if not typical_indices_set or idx in typical_indices_set: 390 main_angles[idx] = long_side_angle 391 392 # 2. For nontypcial indices. 393 if typical_indices_set: 394 typical_center_points = PointList( 395 text_region_center_points[idx] for idx in typical_indices 396 ) 397 kd_tree = KDTree(typical_center_points.to_np_array()) 398 399 nontypical_indices = tuple( 400 idx for idx, _ in enumerate(long_side_angles) if idx not in typical_indices_set 401 ) 402 nontypical_center_points = PointList( 403 text_region_center_points[idx] for idx in nontypical_indices 404 ) 405 406 # Set main angle as the closest typical angle. 407 # Round 1: Set if the closest typical polygon is large enough. 408 _, np_kd_nbr_indices = kd_tree.query(nontypical_center_points.to_np_array()) 409 round2_nontypical_indices: List[int] = [] 410 for nontypical_idx, typical_indices_idx in zip( 411 nontypical_indices, 412 np_kd_nbr_indices[:, 0].tolist(), 413 ): 414 typical_idx = typical_indices[typical_indices_idx] 415 if cls.check_first_text_region_polygon_is_larger( 416 text_region_polygons=text_region_polygons, 417 short_side_lengths=short_side_lengths, 418 first_idx=typical_idx, 419 second_idx=nontypical_idx, 420 ): 421 main_angles[nontypical_idx] = main_angles[typical_idx] 422 else: 423 round2_nontypical_indices.append(nontypical_idx) 424 425 # Round 2: Searching the closest typical polygon that has larger area. 426 round3_nontypical_indices: List[int] = [] 427 if round2_nontypical_indices: 428 round2_nontypical_center_points = PointList( 429 text_region_center_points[idx] for idx in round2_nontypical_indices 430 ) 431 _, np_kd_nbr_indices = kd_tree.query( 432 round2_nontypical_center_points.to_np_array(), 433 k=len(typical_center_points), 434 ) 435 for nontypical_idx, typical_indices_indices in zip( 436 round2_nontypical_indices, 437 np_kd_nbr_indices.tolist(), 438 ): 439 hit_typical_idx = None 440 for typical_indices_idx in typical_indices_indices: 441 typical_idx = typical_indices[typical_indices_idx] 442 if cls.check_first_text_region_polygon_is_larger( 443 text_region_polygons=text_region_polygons, 444 short_side_lengths=short_side_lengths, 445 first_idx=typical_idx, 446 second_idx=nontypical_idx, 447 ): 448 hit_typical_idx = typical_idx 449 break 450 451 if hit_typical_idx is not None: 452 main_angles[nontypical_idx] = main_angles[hit_typical_idx] 453 else: 454 round3_nontypical_indices.append(nontypical_idx) 455 456 # Round 3: Last resort. Set to the median of typical angles. 457 if round3_nontypical_indices: 458 main_angles_median = statistics.median_low( 459 long_side_angles[typical_idx] for typical_idx in typical_indices 460 ) 461 for nontypical_idx in round3_nontypical_indices: 462 main_angles[nontypical_idx] = main_angles_median 463 464 # 3. Get angle for flattening. 465 flattening_rotate_angles: List[int] = [] 466 for main_angle in main_angles: 467 assert main_angle is not None 468 if main_angle <= 90: 469 # [270, 360). 470 flattening_rotate_angle = (360 - main_angle) % 360 471 else: 472 # [1, 90). 473 flattening_rotate_angle = 180 - main_angle 474 flattening_rotate_angles.append(flattening_rotate_angle) 475 476 return cast(List[int], main_angles), flattening_rotate_angles 477 478 @classmethod 479 def get_bounding_extended_text_region_masks( 480 cls, 481 shape: Tuple[int, int], 482 text_region_polygons: Sequence[Polygon], 483 dilated_text_region_polygons: Sequence[Polygon], 484 bounding_rectangular_polygons: Sequence[Polygon], 485 typical_indices: Sequence[int], 486 main_angles: Sequence[int], 487 ): 488 typical_indices_set = set(typical_indices) 489 490 text_mask = Mask.from_polygons(shape, text_region_polygons) 491 non_text_mask = text_mask.to_inverted_mask() 492 493 box = Box.from_shape(shape) 494 text_mask = text_mask.to_box_attached(box) 495 non_text_mask = non_text_mask.to_box_attached(box) 496 497 bounding_extended_text_region_masks: List[Mask] = [] 498 499 num_text_region_polygons = len(text_region_polygons) 500 for idx in range(num_text_region_polygons): 501 text_region_polygon = text_region_polygons[idx] 502 dilated_text_region_polygon = dilated_text_region_polygons[idx] 503 bounding_rectangular_polygon = bounding_rectangular_polygons[idx] 504 505 if typical_indices_set and idx not in typical_indices_set: 506 # Patch bounding rectangular polygon if is nontypical. 507 main_angle = main_angles[idx] 508 bounding_rectangular_polygon = \ 509 dilated_text_region_polygon.to_bounding_rectangular_polygon( 510 shape=shape, 511 angle=main_angle, 512 ) 513 514 # See the comment in Polygon.to_bounding_rectangular_polygon. 515 bounding_box = Box.from_boxes(( 516 dilated_text_region_polygon.bounding_box, 517 bounding_rectangular_polygon.bounding_box, 518 )) 519 520 # Fill other text region. 521 bounding_other_text_mask = \ 522 Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 523 # Copy from text mask. 524 bounding_rectangular_polygon.fill_mask(bounding_other_text_mask, text_mask) 525 # Use the original text region polygon to unset the current text mask. 526 text_region_polygon.fill_mask(bounding_other_text_mask, 0) 527 528 # Fill protentially dilated text region. 529 bounding_text_mask = \ 530 Mask.from_shapable(bounding_other_text_mask).to_box_attached(bounding_box) 531 # Use the protentially dilated text region polygon to set the current text mask. 532 dilated_text_region_polygon.fill_mask(bounding_text_mask, value=1) 533 534 del dilated_text_region_polygon 535 536 # Trim protentially dilated text region polygon by eliminating other text region. 537 bounding_trimmed_text_mask = Mask.from_masks( 538 bounding_box, 539 [ 540 # Includes the protentially dilated text region. 541 bounding_text_mask, 542 # But not includes any other text regions. 543 bounding_other_text_mask.to_inverted_mask(), 544 ], 545 ElementSetOperationMode.INTERSECT, 546 ) 547 548 # Extract non-text region. 549 bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask) 550 551 # Unionize trimmed text region and non-text region. 552 bounding_extended_text_region_mask = Mask.from_masks( 553 bounding_box, 554 [bounding_trimmed_text_mask, bounding_non_text_mask], 555 ) 556 557 bounding_extended_text_region_masks.append(bounding_extended_text_region_mask) 558 559 return bounding_extended_text_region_masks 560 561 @classmethod 562 def build_flattened_text_regions( 563 cls, 564 image: Image, 565 text_region_polygons: Sequence[Polygon], 566 bounding_extended_text_region_masks: Sequence[Mask], 567 typical_indices: Sequence[int], 568 flattening_rotate_angles: Sequence[int], 569 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 570 ): 571 typical_indices_set = set(typical_indices) 572 573 flattened_text_regions: List[FlattenedTextRegion] = [] 574 575 for idx, ( 576 text_region_polygon, 577 bounding_extended_text_region_mask, 578 flattening_rotate_angle, 579 ) in enumerate( 580 zip( 581 text_region_polygons, 582 bounding_extended_text_region_masks, 583 flattening_rotate_angles, 584 ) 585 ): 586 bounding_box = bounding_extended_text_region_mask.box 587 assert bounding_box 588 589 # Extract image. 590 text_region_image = bounding_extended_text_region_mask.extract_image(image) 591 592 # Shift char polygons. 593 relative_char_polygons = None 594 if grouped_char_polygons is not None: 595 char_polygons = grouped_char_polygons[idx] 596 relative_char_polygons = [ 597 char_polygon.to_relative_polygon( 598 origin_y=bounding_box.up, 599 origin_x=bounding_box.left, 600 ) for char_polygon in char_polygons 601 ] 602 603 # Rotate. 604 rotated_result = rotate.distort( 605 {'angle': flattening_rotate_angle}, 606 image=text_region_image, 607 mask=bounding_extended_text_region_mask, 608 polygons=relative_char_polygons, 609 ) 610 rotated_text_region_image = rotated_result.image 611 assert rotated_text_region_image 612 rotated_bounding_extended_text_region_mask = rotated_result.mask 613 assert rotated_bounding_extended_text_region_mask 614 # Could be None. 615 rotated_char_polygons = rotated_result.polygons 616 617 # Trim. 618 rotated_trimmed_box = rotated_bounding_extended_text_region_mask.to_external_box() 619 620 trimmed_text_region_image = rotated_text_region_image.to_cropped_image( 621 up=rotated_trimmed_box.up, 622 down=rotated_trimmed_box.down, 623 left=rotated_trimmed_box.left, 624 right=rotated_trimmed_box.right, 625 ) 626 627 trimmed_mask = rotated_trimmed_box.extract_mask( 628 rotated_bounding_extended_text_region_mask 629 ) 630 631 trimmed_char_polygons = None 632 if rotated_char_polygons: 633 trimmed_char_polygons = [ 634 rotated_char_polygon.to_relative_polygon( 635 origin_y=rotated_trimmed_box.up, 636 origin_x=rotated_trimmed_box.left, 637 ) for rotated_char_polygon in rotated_char_polygons 638 ] 639 640 flattened_text_regions.append( 641 FlattenedTextRegion( 642 is_typical=(idx in typical_indices_set), 643 text_region_polygon=text_region_polygon, 644 text_region_image=bounding_extended_text_region_mask.extract_image(image), 645 bounding_extended_text_region_mask=bounding_extended_text_region_mask, 646 flattening_rotate_angle=flattening_rotate_angle, 647 shape_before_trim=rotated_text_region_image.shape, 648 rotated_trimmed_box=rotated_trimmed_box, 649 shape_before_resize=trimmed_text_region_image.shape, 650 post_rotate_angle=0, 651 flattened_image=trimmed_text_region_image, 652 flattened_mask=trimmed_mask, 653 flattened_char_polygons=trimmed_char_polygons, 654 ) 655 ) 656 657 return flattened_text_regions 658 659 def __init__( 660 self, 661 typical_long_side_ratio_min: float, 662 text_region_polygon_dilate_ratio: float, 663 image: Image, 664 text_region_polygons: Sequence[Polygon], 665 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None, 666 is_training: bool = False, 667 ): 668 self.original_text_region_polygons = text_region_polygons 669 670 self.text_region_polygons = self.patch_text_region_polygons( 671 text_region_polygons=text_region_polygons, 672 grouped_char_polygons=grouped_char_polygons, 673 ) 674 675 force_no_dilation_flags = None 676 if is_training: 677 assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons) 678 force_no_dilation_flags = [] 679 for char_polygons in grouped_char_polygons: 680 force_no_dilation_flags.append(not char_polygons) 681 682 ( 683 self.dilated_text_region_polygons, 684 self.bounding_rectangular_polygons, 685 ) = self.get_dilated_and_bounding_rectangular_polygons( 686 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 687 shape=image.shape, 688 text_region_polygons=self.text_region_polygons, 689 force_no_dilation_flags=force_no_dilation_flags, 690 ) 691 692 ( 693 self.short_side_lengths, 694 self.long_side_ratios, 695 self.long_side_angles, 696 ) = self.analyze_bounding_rectangular_polygons(self.bounding_rectangular_polygons) 697 698 self.typical_indices = self.get_typical_indices( 699 typical_long_side_ratio_min=typical_long_side_ratio_min, 700 long_side_ratios=self.long_side_ratios, 701 ) 702 703 ( 704 self.main_angles, 705 self.flattening_rotate_angles, 706 ) = self.get_main_and_flattening_rotate_angles( 707 text_region_polygons=self.text_region_polygons, 708 typical_indices=self.typical_indices, 709 short_side_lengths=self.short_side_lengths, 710 long_side_angles=self.long_side_angles, 711 ) 712 713 self.bounding_extended_text_region_masks = self.get_bounding_extended_text_region_masks( 714 shape=image.shape, 715 text_region_polygons=self.text_region_polygons, 716 dilated_text_region_polygons=self.dilated_text_region_polygons, 717 bounding_rectangular_polygons=self.bounding_rectangular_polygons, 718 typical_indices=self.typical_indices, 719 main_angles=self.main_angles, 720 ) 721 722 self.flattened_text_regions = self.build_flattened_text_regions( 723 image=image, 724 # NOTE: need to use the original text region polygons for reversed opts. 725 text_region_polygons=self.original_text_region_polygons, 726 bounding_extended_text_region_masks=self.bounding_extended_text_region_masks, 727 typical_indices=self.typical_indices, 728 flattening_rotate_angles=self.flattening_rotate_angles, 729 grouped_char_polygons=grouped_char_polygons, 730 )
659 def __init__( 660 self, 661 typical_long_side_ratio_min: float, 662 text_region_polygon_dilate_ratio: float, 663 image: Image, 664 text_region_polygons: Sequence[Polygon], 665 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]] = None, 666 is_training: bool = False, 667 ): 668 self.original_text_region_polygons = text_region_polygons 669 670 self.text_region_polygons = self.patch_text_region_polygons( 671 text_region_polygons=text_region_polygons, 672 grouped_char_polygons=grouped_char_polygons, 673 ) 674 675 force_no_dilation_flags = None 676 if is_training: 677 assert grouped_char_polygons and len(text_region_polygons) == len(grouped_char_polygons) 678 force_no_dilation_flags = [] 679 for char_polygons in grouped_char_polygons: 680 force_no_dilation_flags.append(not char_polygons) 681 682 ( 683 self.dilated_text_region_polygons, 684 self.bounding_rectangular_polygons, 685 ) = self.get_dilated_and_bounding_rectangular_polygons( 686 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 687 shape=image.shape, 688 text_region_polygons=self.text_region_polygons, 689 force_no_dilation_flags=force_no_dilation_flags, 690 ) 691 692 ( 693 self.short_side_lengths, 694 self.long_side_ratios, 695 self.long_side_angles, 696 ) = self.analyze_bounding_rectangular_polygons(self.bounding_rectangular_polygons) 697 698 self.typical_indices = self.get_typical_indices( 699 typical_long_side_ratio_min=typical_long_side_ratio_min, 700 long_side_ratios=self.long_side_ratios, 701 ) 702 703 ( 704 self.main_angles, 705 self.flattening_rotate_angles, 706 ) = self.get_main_and_flattening_rotate_angles( 707 text_region_polygons=self.text_region_polygons, 708 typical_indices=self.typical_indices, 709 short_side_lengths=self.short_side_lengths, 710 long_side_angles=self.long_side_angles, 711 ) 712 713 self.bounding_extended_text_region_masks = self.get_bounding_extended_text_region_masks( 714 shape=image.shape, 715 text_region_polygons=self.text_region_polygons, 716 dilated_text_region_polygons=self.dilated_text_region_polygons, 717 bounding_rectangular_polygons=self.bounding_rectangular_polygons, 718 typical_indices=self.typical_indices, 719 main_angles=self.main_angles, 720 ) 721 722 self.flattened_text_regions = self.build_flattened_text_regions( 723 image=image, 724 # NOTE: need to use the original text region polygons for reversed opts. 725 text_region_polygons=self.original_text_region_polygons, 726 bounding_extended_text_region_masks=self.bounding_extended_text_region_masks, 727 typical_indices=self.typical_indices, 728 flattening_rotate_angles=self.flattening_rotate_angles, 729 grouped_char_polygons=grouped_char_polygons, 730 )
233 @classmethod 234 def patch_text_region_polygons( 235 cls, 236 text_region_polygons: Sequence[Polygon], 237 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 238 ): 239 if grouped_char_polygons is None: 240 return text_region_polygons 241 242 assert len(text_region_polygons) == len(grouped_char_polygons) 243 244 patched_text_region_polygons: List[Polygon] = [] 245 for text_region_polygon, char_polygons in zip(text_region_polygons, grouped_char_polygons): 246 # Need to make sure all char polygons are included. 247 unionized_polygons = [text_region_polygon] 248 unionized_polygons.extend(char_polygons) 249 250 bounding_box = Box.from_boxes((polygon.bounding_box for polygon in unionized_polygons)) 251 mask = Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 252 for polygon in unionized_polygons: 253 polygon.fill_mask(mask) 254 255 patched_text_region_polygons.append(mask.to_external_polygon()) 256 257 return patched_text_region_polygons
259 @classmethod 260 def get_dilated_and_bounding_rectangular_polygons( 261 cls, 262 text_region_polygon_dilate_ratio: float, 263 shape: Tuple[int, int], 264 text_region_polygons: Sequence[Polygon], 265 force_no_dilation_flags: Optional[Sequence[bool]] = None, 266 ): 267 dilated_text_region_polygons: List[Polygon] = [] 268 bounding_rectangular_polygons: List[Polygon] = [] 269 270 if force_no_dilation_flags is None: 271 force_no_dilation_flags_iter = itertools.repeat(False) 272 else: 273 assert len(force_no_dilation_flags) == len(text_region_polygons) 274 force_no_dilation_flags_iter = force_no_dilation_flags 275 276 for text_region_polygon, force_no_dilation_flag in zip( 277 text_region_polygons, force_no_dilation_flags_iter 278 ): 279 280 if not force_no_dilation_flag: 281 # Dilate. 282 text_region_polygon = text_region_polygon.to_dilated_polygon( 283 ratio=text_region_polygon_dilate_ratio, 284 ) 285 text_region_polygon = text_region_polygon.to_clipped_polygon(shape) 286 287 dilated_text_region_polygons.append(text_region_polygon) 288 bounding_rectangular_polygons.append( 289 text_region_polygon.to_bounding_rectangular_polygon(shape) 290 ) 291 292 return dilated_text_region_polygons, bounding_rectangular_polygons
294 @classmethod 295 def analyze_bounding_rectangular_polygons( 296 cls, 297 bounding_rectangular_polygons: Sequence[Polygon], 298 ): 299 short_side_lengths: List[float] = [] 300 long_side_ratios: List[float] = [] 301 long_side_angles: List[int] = [] 302 303 for polygon in bounding_rectangular_polygons: 304 # Get reference line. 305 point0, point1, _, point3 = polygon.points 306 side0_length = math.hypot( 307 point0.smooth_y - point1.smooth_y, 308 point0.smooth_x - point1.smooth_x, 309 ) 310 side1_length = math.hypot( 311 point0.smooth_y - point3.smooth_y, 312 point0.smooth_x - point3.smooth_x, 313 ) 314 315 # Get the short side length. 316 short_side_lengths.append(min(side0_length, side1_length)) 317 318 long_side_ratios.append( 319 max(side0_length, side1_length) / min(side0_length, side1_length) 320 ) 321 322 point_a = point0 323 if side0_length > side1_length: 324 # Reference line (p0 -> p1). 325 point_b = point1 326 else: 327 # Reference line (p0 -> p3). 328 point_b = point3 329 330 # Get the angle of reference line, in [0, 180) degree. 331 np_theta = np.arctan2( 332 point_a.smooth_y - point_b.smooth_y, 333 point_a.smooth_x - point_b.smooth_x, 334 ) 335 np_theta = np_theta % np.pi 336 long_side_angle = round(np_theta / np.pi * 180) % 180 337 long_side_angles.append(long_side_angle) 338 339 return short_side_lengths, long_side_ratios, long_side_angles
352 @classmethod 353 def check_first_text_region_polygon_is_larger( 354 cls, 355 text_region_polygons: Sequence[Polygon], 356 short_side_lengths: Sequence[float], 357 first_idx: int, 358 second_idx: int, 359 ): 360 first_text_region_polygon = text_region_polygons[first_idx] 361 second_text_region_polygon = text_region_polygons[second_idx] 362 363 # The short side indicates the text line height. 364 first_short_side_length = short_side_lengths[first_idx] 365 second_short_side_length = short_side_lengths[second_idx] 366 367 return ( 368 first_text_region_polygon.area >= second_text_region_polygon.area 369 and first_short_side_length >= second_short_side_length 370 )
372 @classmethod 373 def get_main_and_flattening_rotate_angles( 374 cls, 375 text_region_polygons: Sequence[Polygon], 376 typical_indices: Sequence[int], 377 short_side_lengths: Sequence[float], 378 long_side_angles: Sequence[int], 379 ): 380 typical_indices_set = set(typical_indices) 381 text_region_center_points = [ 382 text_region_polygon.get_center_point() for text_region_polygon in text_region_polygons 383 ] 384 385 main_angles: List[Optional[int]] = [None] * len(long_side_angles) 386 387 # 1. For typical indices, or if no typical indices. 388 for idx, long_side_angle in enumerate(long_side_angles): 389 if not typical_indices_set or idx in typical_indices_set: 390 main_angles[idx] = long_side_angle 391 392 # 2. For nontypcial indices. 393 if typical_indices_set: 394 typical_center_points = PointList( 395 text_region_center_points[idx] for idx in typical_indices 396 ) 397 kd_tree = KDTree(typical_center_points.to_np_array()) 398 399 nontypical_indices = tuple( 400 idx for idx, _ in enumerate(long_side_angles) if idx not in typical_indices_set 401 ) 402 nontypical_center_points = PointList( 403 text_region_center_points[idx] for idx in nontypical_indices 404 ) 405 406 # Set main angle as the closest typical angle. 407 # Round 1: Set if the closest typical polygon is large enough. 408 _, np_kd_nbr_indices = kd_tree.query(nontypical_center_points.to_np_array()) 409 round2_nontypical_indices: List[int] = [] 410 for nontypical_idx, typical_indices_idx in zip( 411 nontypical_indices, 412 np_kd_nbr_indices[:, 0].tolist(), 413 ): 414 typical_idx = typical_indices[typical_indices_idx] 415 if cls.check_first_text_region_polygon_is_larger( 416 text_region_polygons=text_region_polygons, 417 short_side_lengths=short_side_lengths, 418 first_idx=typical_idx, 419 second_idx=nontypical_idx, 420 ): 421 main_angles[nontypical_idx] = main_angles[typical_idx] 422 else: 423 round2_nontypical_indices.append(nontypical_idx) 424 425 # Round 2: Searching the closest typical polygon that has larger area. 426 round3_nontypical_indices: List[int] = [] 427 if round2_nontypical_indices: 428 round2_nontypical_center_points = PointList( 429 text_region_center_points[idx] for idx in round2_nontypical_indices 430 ) 431 _, np_kd_nbr_indices = kd_tree.query( 432 round2_nontypical_center_points.to_np_array(), 433 k=len(typical_center_points), 434 ) 435 for nontypical_idx, typical_indices_indices in zip( 436 round2_nontypical_indices, 437 np_kd_nbr_indices.tolist(), 438 ): 439 hit_typical_idx = None 440 for typical_indices_idx in typical_indices_indices: 441 typical_idx = typical_indices[typical_indices_idx] 442 if cls.check_first_text_region_polygon_is_larger( 443 text_region_polygons=text_region_polygons, 444 short_side_lengths=short_side_lengths, 445 first_idx=typical_idx, 446 second_idx=nontypical_idx, 447 ): 448 hit_typical_idx = typical_idx 449 break 450 451 if hit_typical_idx is not None: 452 main_angles[nontypical_idx] = main_angles[hit_typical_idx] 453 else: 454 round3_nontypical_indices.append(nontypical_idx) 455 456 # Round 3: Last resort. Set to the median of typical angles. 457 if round3_nontypical_indices: 458 main_angles_median = statistics.median_low( 459 long_side_angles[typical_idx] for typical_idx in typical_indices 460 ) 461 for nontypical_idx in round3_nontypical_indices: 462 main_angles[nontypical_idx] = main_angles_median 463 464 # 3. Get angle for flattening. 465 flattening_rotate_angles: List[int] = [] 466 for main_angle in main_angles: 467 assert main_angle is not None 468 if main_angle <= 90: 469 # [270, 360). 470 flattening_rotate_angle = (360 - main_angle) % 360 471 else: 472 # [1, 90). 473 flattening_rotate_angle = 180 - main_angle 474 flattening_rotate_angles.append(flattening_rotate_angle) 475 476 return cast(List[int], main_angles), flattening_rotate_angles
478 @classmethod 479 def get_bounding_extended_text_region_masks( 480 cls, 481 shape: Tuple[int, int], 482 text_region_polygons: Sequence[Polygon], 483 dilated_text_region_polygons: Sequence[Polygon], 484 bounding_rectangular_polygons: Sequence[Polygon], 485 typical_indices: Sequence[int], 486 main_angles: Sequence[int], 487 ): 488 typical_indices_set = set(typical_indices) 489 490 text_mask = Mask.from_polygons(shape, text_region_polygons) 491 non_text_mask = text_mask.to_inverted_mask() 492 493 box = Box.from_shape(shape) 494 text_mask = text_mask.to_box_attached(box) 495 non_text_mask = non_text_mask.to_box_attached(box) 496 497 bounding_extended_text_region_masks: List[Mask] = [] 498 499 num_text_region_polygons = len(text_region_polygons) 500 for idx in range(num_text_region_polygons): 501 text_region_polygon = text_region_polygons[idx] 502 dilated_text_region_polygon = dilated_text_region_polygons[idx] 503 bounding_rectangular_polygon = bounding_rectangular_polygons[idx] 504 505 if typical_indices_set and idx not in typical_indices_set: 506 # Patch bounding rectangular polygon if is nontypical. 507 main_angle = main_angles[idx] 508 bounding_rectangular_polygon = \ 509 dilated_text_region_polygon.to_bounding_rectangular_polygon( 510 shape=shape, 511 angle=main_angle, 512 ) 513 514 # See the comment in Polygon.to_bounding_rectangular_polygon. 515 bounding_box = Box.from_boxes(( 516 dilated_text_region_polygon.bounding_box, 517 bounding_rectangular_polygon.bounding_box, 518 )) 519 520 # Fill other text region. 521 bounding_other_text_mask = \ 522 Mask.from_shapable(bounding_box).to_box_attached(bounding_box) 523 # Copy from text mask. 524 bounding_rectangular_polygon.fill_mask(bounding_other_text_mask, text_mask) 525 # Use the original text region polygon to unset the current text mask. 526 text_region_polygon.fill_mask(bounding_other_text_mask, 0) 527 528 # Fill protentially dilated text region. 529 bounding_text_mask = \ 530 Mask.from_shapable(bounding_other_text_mask).to_box_attached(bounding_box) 531 # Use the protentially dilated text region polygon to set the current text mask. 532 dilated_text_region_polygon.fill_mask(bounding_text_mask, value=1) 533 534 del dilated_text_region_polygon 535 536 # Trim protentially dilated text region polygon by eliminating other text region. 537 bounding_trimmed_text_mask = Mask.from_masks( 538 bounding_box, 539 [ 540 # Includes the protentially dilated text region. 541 bounding_text_mask, 542 # But not includes any other text regions. 543 bounding_other_text_mask.to_inverted_mask(), 544 ], 545 ElementSetOperationMode.INTERSECT, 546 ) 547 548 # Extract non-text region. 549 bounding_non_text_mask = bounding_rectangular_polygon.extract_mask(non_text_mask) 550 551 # Unionize trimmed text region and non-text region. 552 bounding_extended_text_region_mask = Mask.from_masks( 553 bounding_box, 554 [bounding_trimmed_text_mask, bounding_non_text_mask], 555 ) 556 557 bounding_extended_text_region_masks.append(bounding_extended_text_region_mask) 558 559 return bounding_extended_text_region_masks
561 @classmethod 562 def build_flattened_text_regions( 563 cls, 564 image: Image, 565 text_region_polygons: Sequence[Polygon], 566 bounding_extended_text_region_masks: Sequence[Mask], 567 typical_indices: Sequence[int], 568 flattening_rotate_angles: Sequence[int], 569 grouped_char_polygons: Optional[Sequence[Sequence[Polygon]]], 570 ): 571 typical_indices_set = set(typical_indices) 572 573 flattened_text_regions: List[FlattenedTextRegion] = [] 574 575 for idx, ( 576 text_region_polygon, 577 bounding_extended_text_region_mask, 578 flattening_rotate_angle, 579 ) in enumerate( 580 zip( 581 text_region_polygons, 582 bounding_extended_text_region_masks, 583 flattening_rotate_angles, 584 ) 585 ): 586 bounding_box = bounding_extended_text_region_mask.box 587 assert bounding_box 588 589 # Extract image. 590 text_region_image = bounding_extended_text_region_mask.extract_image(image) 591 592 # Shift char polygons. 593 relative_char_polygons = None 594 if grouped_char_polygons is not None: 595 char_polygons = grouped_char_polygons[idx] 596 relative_char_polygons = [ 597 char_polygon.to_relative_polygon( 598 origin_y=bounding_box.up, 599 origin_x=bounding_box.left, 600 ) for char_polygon in char_polygons 601 ] 602 603 # Rotate. 604 rotated_result = rotate.distort( 605 {'angle': flattening_rotate_angle}, 606 image=text_region_image, 607 mask=bounding_extended_text_region_mask, 608 polygons=relative_char_polygons, 609 ) 610 rotated_text_region_image = rotated_result.image 611 assert rotated_text_region_image 612 rotated_bounding_extended_text_region_mask = rotated_result.mask 613 assert rotated_bounding_extended_text_region_mask 614 # Could be None. 615 rotated_char_polygons = rotated_result.polygons 616 617 # Trim. 618 rotated_trimmed_box = rotated_bounding_extended_text_region_mask.to_external_box() 619 620 trimmed_text_region_image = rotated_text_region_image.to_cropped_image( 621 up=rotated_trimmed_box.up, 622 down=rotated_trimmed_box.down, 623 left=rotated_trimmed_box.left, 624 right=rotated_trimmed_box.right, 625 ) 626 627 trimmed_mask = rotated_trimmed_box.extract_mask( 628 rotated_bounding_extended_text_region_mask 629 ) 630 631 trimmed_char_polygons = None 632 if rotated_char_polygons: 633 trimmed_char_polygons = [ 634 rotated_char_polygon.to_relative_polygon( 635 origin_y=rotated_trimmed_box.up, 636 origin_x=rotated_trimmed_box.left, 637 ) for rotated_char_polygon in rotated_char_polygons 638 ] 639 640 flattened_text_regions.append( 641 FlattenedTextRegion( 642 is_typical=(idx in typical_indices_set), 643 text_region_polygon=text_region_polygon, 644 text_region_image=bounding_extended_text_region_mask.extract_image(image), 645 bounding_extended_text_region_mask=bounding_extended_text_region_mask, 646 flattening_rotate_angle=flattening_rotate_angle, 647 shape_before_trim=rotated_text_region_image.shape, 648 rotated_trimmed_box=rotated_trimmed_box, 649 shape_before_resize=trimmed_text_region_image.shape, 650 post_rotate_angle=0, 651 flattened_image=trimmed_text_region_image, 652 flattened_mask=trimmed_mask, 653 flattened_char_polygons=trimmed_char_polygons, 654 ) 655 ) 656 657 return flattened_text_regions
733def build_background_image_for_stacking(height: int, width: int): 734 np_rgb_rows = [np.zeros((width, 3), dtype=np.uint8) for _ in range(3)] 735 rgb_tuples = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 736 737 for color_offset, np_row in enumerate(np_rgb_rows): 738 for color_idx in range(3): 739 color_tuple = rgb_tuples[(color_offset + color_idx) % 3] 740 np_row[color_idx::3] = color_tuple 741 742 np_image = np.zeros((height, width, 3), dtype=np.uint8) 743 for row_offset, np_row in enumerate(np_rgb_rows): 744 np_image[row_offset::3] = np_row 745 746 return Image(mat=np_image)
749def stack_flattened_text_regions( 750 page_pad: int, 751 flattened_text_regions_pad: int, 752 flattened_text_regions: Sequence[FlattenedTextRegion], 753): 754 page_double_pad = 2 * page_pad 755 flattened_text_regions_double_pad = 2 * flattened_text_regions_pad 756 757 rect_packer = RectPacker(rotation=False) 758 759 # Add box and bin. 760 # NOTE: Only one bin is added, that is, packing all text region into one image. 761 bin_width = 0 762 bin_height = 0 763 764 for ftr_idx, flattened_text_region in enumerate(flattened_text_regions): 765 rect_packer.add_rect( 766 width=flattened_text_region.width + flattened_text_regions_double_pad, 767 height=flattened_text_region.height + flattened_text_regions_double_pad, 768 rid=ftr_idx, 769 ) 770 771 bin_width = max(bin_width, flattened_text_region.width) 772 bin_height += flattened_text_region.height 773 774 bin_width += flattened_text_regions_double_pad 775 bin_height += flattened_text_regions_double_pad 776 777 rect_packer.add_bin(width=bin_width, height=bin_height) 778 779 # Pack boxes. 780 rect_packer.pack() # type: ignore 781 782 # Get packed boxes. 783 unordered_boxes: List[Box] = [] 784 ftr_indices: List[int] = [] 785 for bin_idx, x, y, width, height, ftr_idx in rect_packer.rect_list(): 786 assert bin_idx == 0 787 unordered_boxes.append(Box( 788 up=y, 789 down=y + height - 1, 790 left=x, 791 right=x + width - 1, 792 )) 793 ftr_indices.append(ftr_idx) 794 795 # Order boxes. 796 inverse_ftr_indices = [-1] * len(ftr_indices) 797 for inverse_ftr_idx, ftr_idx in enumerate(ftr_indices): 798 inverse_ftr_indices[ftr_idx] = inverse_ftr_idx 799 for inverse_ftr_idx in inverse_ftr_indices: 800 assert inverse_ftr_idx >= 0 801 padded_boxes = [unordered_boxes[inverse_ftr_idx] for inverse_ftr_idx in inverse_ftr_indices] 802 803 page_height = max(box.down for box in padded_boxes) + 1 + page_double_pad 804 page_width = max(box.right for box in padded_boxes) + 1 + page_double_pad 805 806 image = build_background_image_for_stacking(page_height, page_width) 807 active_mask = Mask.from_shapable(image) 808 text_region_boxes: List[Box] = [] 809 char_polygons: List[Polygon] = [] 810 char_polygon_text_region_box_indices: List[int] = [] 811 812 for padded_box, flattened_text_region in zip(padded_boxes, flattened_text_regions): 813 assert flattened_text_region.height + flattened_text_regions_double_pad \ 814 == padded_box.height 815 assert flattened_text_region.width + flattened_text_regions_double_pad \ 816 == padded_box.width 817 818 # Remove box padding. 819 up = padded_box.up + flattened_text_regions_pad + page_pad 820 left = padded_box.left + flattened_text_regions_pad + page_pad 821 822 text_region_box = Box( 823 up=up, 824 down=up + flattened_text_region.height - 1, 825 left=left, 826 right=left + flattened_text_region.width - 1, 827 ) 828 text_region_boxes.append(text_region_box) 829 text_region_box_idx = len(text_region_boxes) - 1 830 831 # Render. 832 text_region_box.fill_image( 833 image, 834 flattened_text_region.flattened_image, 835 image_mask=flattened_text_region.flattened_mask, 836 ) 837 text_region_box.fill_mask( 838 active_mask, 839 value=1, 840 mask_mask=flattened_text_region.flattened_mask, 841 ) 842 843 if flattened_text_region.flattened_char_polygons: 844 for char_polygon in flattened_text_region.flattened_char_polygons: 845 char_polygons.append(char_polygon.to_shifted_polygon( 846 offset_y=up, 847 offset_x=left, 848 )) 849 char_polygon_text_region_box_indices.append(text_region_box_idx) 850 851 return ( 852 image, 853 active_mask, 854 text_region_boxes, 855 char_polygons, 856 char_polygon_text_region_box_indices, 857 )
860class PageTextRegionStep( 861 PipelineStep[ 862 PageTextRegionStepConfig, 863 PageTextRegionStepInput, 864 PageTextRegionStepOutput, 865 ] 866): # yapf: disable 867 868 @classmethod 869 def generate_precise_text_region_candidate_polygons( 870 cls, 871 precise_mask: Mask, 872 disconnected_text_region_mask: Mask, 873 ): 874 assert precise_mask.box and disconnected_text_region_mask.box 875 876 # Get the intersection. 877 intersected_box = Box( 878 up=max(precise_mask.box.up, disconnected_text_region_mask.box.up), 879 down=min(precise_mask.box.down, disconnected_text_region_mask.box.down), 880 left=max(precise_mask.box.left, disconnected_text_region_mask.box.left), 881 right=min(precise_mask.box.right, disconnected_text_region_mask.box.right), 882 ) 883 assert intersected_box.up <= intersected_box.down 884 assert intersected_box.left <= intersected_box.right 885 886 precise_mask = intersected_box.extract_mask(precise_mask) 887 disconnected_text_region_mask = intersected_box.extract_mask(disconnected_text_region_mask) 888 889 # Apply mask bitwise-and operation. 890 intersected_mask = Mask( 891 mat=(disconnected_text_region_mask.mat & precise_mask.mat).astype(np.uint8) 892 ) 893 intersected_mask = intersected_mask.to_box_attached(intersected_box) 894 895 # NOTE: 896 # 1. Could extract more than one polygons. 897 # 2. Some polygons are in border and should be removed later. 898 return intersected_mask.to_disconnected_polygons() 899 900 @classmethod 901 def strtree_query_intersected_polygons( 902 cls, 903 strtree: STRtree, 904 anchor_polygons: Sequence[Polygon], 905 candidate_polygon: Polygon, 906 ): 907 candidate_shapely_polygon = candidate_polygon.to_shapely_polygon() 908 candidate_mask = candidate_polygon.mask 909 910 for anchor_idx in sorted(strtree.query(candidate_shapely_polygon)): 911 anchor_polygon = anchor_polygons[anchor_idx] 912 anchor_mask = anchor_polygon.mask 913 914 intersected_ratio = calculate_boxed_masks_intersected_ratio( 915 anchor_mask=anchor_mask, 916 candidate_mask=candidate_mask, 917 use_candidate_as_base=True, 918 ) 919 920 yield ( 921 anchor_idx, 922 anchor_polygon, 923 anchor_mask, 924 candidate_mask, 925 intersected_ratio, 926 ) 927 928 def sample_page_non_text_region_polygons( 929 self, 930 page_non_text_region_polygons: Sequence[Polygon], 931 num_page_text_region_infos: int, 932 rng: RandomGenerator, 933 ): 934 negative_ratio = self.config.negative_text_region_ratio 935 num_page_non_text_region_polygons = round( 936 negative_ratio * num_page_text_region_infos / (1 - negative_ratio) 937 ) 938 return rng_choice_with_size( 939 rng, 940 page_non_text_region_polygons, 941 size=min( 942 num_page_non_text_region_polygons, 943 len(page_non_text_region_polygons), 944 ), 945 replace=False, 946 ) 947 948 def build_flattened_text_regions( 949 self, 950 page_image: Image, 951 page_text_region_infos: Sequence[PageTextRegionInfo], 952 page_non_text_region_polygons: Sequence[Polygon], 953 rng: RandomGenerator, 954 ): 955 text_region_polygon_dilate_ratio = float( 956 rng.uniform( 957 self.config.text_region_flattener_text_region_polygon_dilate_ratio_min, 958 self.config.text_region_flattener_text_region_polygon_dilate_ratio_max, 959 ) 960 ) 961 typical_long_side_ratio_min = \ 962 self.config.text_region_flattener_typical_long_side_ratio_min 963 964 text_region_polygons: List[Polygon] = [] 965 grouped_char_polygons: List[Sequence[Polygon]] = [] 966 for page_text_region_info in page_text_region_infos: 967 text_region_polygons.append(page_text_region_info.precise_text_region_polygon) 968 grouped_char_polygons.append(page_text_region_info.char_polygons) 969 970 # Inject nagative regions. 971 for page_non_text_region_polygon in page_non_text_region_polygons: 972 # NOTE: Don't drop any text region here, otherwise will introduce labeling confusion, 973 # since dropped text region will be considered as non-text region. 974 text_region_polygons.append(page_non_text_region_polygon) 975 grouped_char_polygons.append(tuple()) 976 977 text_region_flattener = TextRegionFlattener( 978 typical_long_side_ratio_min=typical_long_side_ratio_min, 979 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 980 image=page_image, 981 text_region_polygons=text_region_polygons, 982 grouped_char_polygons=grouped_char_polygons, 983 is_training=True, 984 ) 985 986 # Resize positive ftr. 987 positive_flattened_text_regions: List[FlattenedTextRegion] = [] 988 # For negative sampling. 989 positive_reference_heights: List[float] = [] 990 positive_reference_widths: List[float] = [] 991 num_negative_flattened_text_regions = 0 992 993 for flattened_text_region in text_region_flattener.flattened_text_regions: 994 if not flattened_text_region.flattened_char_polygons: 995 num_negative_flattened_text_regions += 1 996 continue 997 998 if len(flattened_text_region.flattened_char_polygons) == 1 \ 999 and rng.random() < self.config.prob_drop_single_char_page_text_region_info: 1000 # Ignore some single-char text region for reducing label confusion. 1001 continue 1002 1003 char_height_median = flattened_text_region.get_char_height_meidan() 1004 1005 text_region_resize_char_height_median = int( 1006 rng.integers( 1007 self.config.text_region_resize_char_height_median_min, 1008 self.config.text_region_resize_char_height_median_max + 1, 1009 ) 1010 ) 1011 scale = text_region_resize_char_height_median / char_height_median 1012 1013 height, width = flattened_text_region.shape 1014 resized_height = round(height * scale) 1015 resized_width = round(width * scale) 1016 1017 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1018 resized_height=resized_height, 1019 resized_width=resized_width, 1020 ) 1021 1022 positive_reference_heights.append(resized_height) 1023 positive_reference_widths.append(resized_width) 1024 1025 # Post rotate. 1026 post_rotate_angle = 0 1027 if flattened_text_region.is_typical: 1028 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1029 # Upside down only. 1030 post_rotate_angle = 180 1031 else: 1032 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1033 # 3-way rotate. 1034 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1035 1036 if post_rotate_angle != 0: 1037 flattened_text_region = \ 1038 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1039 1040 positive_flattened_text_regions.append(flattened_text_region) 1041 1042 # Resize negative ftr. 1043 negative_reference_heights = list( 1044 rng_choice_with_size( 1045 rng, 1046 positive_reference_heights, 1047 size=num_negative_flattened_text_regions, 1048 replace=(num_negative_flattened_text_regions > len(positive_reference_heights)), 1049 ) 1050 ) 1051 1052 negative_height_max = max(positive_reference_heights) 1053 negative_width_max = max(positive_reference_widths) 1054 1055 negative_flattened_text_regions: List[FlattenedTextRegion] = [] 1056 1057 for flattened_text_region in text_region_flattener.flattened_text_regions: 1058 if flattened_text_region.flattened_char_polygons: 1059 continue 1060 1061 reference_height = negative_reference_heights.pop() 1062 scale = reference_height / flattened_text_region.height 1063 1064 height, width = flattened_text_region.shape 1065 resized_height = round(height * scale) 1066 resized_width = round(width * scale) 1067 1068 # Remove negative region that is too large. 1069 if resized_height > negative_height_max or resized_width > negative_width_max: 1070 continue 1071 1072 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1073 resized_height=resized_height, 1074 resized_width=resized_width, 1075 ) 1076 1077 # Post rotate. 1078 post_rotate_angle = 0 1079 if flattened_text_region.is_typical: 1080 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1081 # Upside down only. 1082 post_rotate_angle = 180 1083 else: 1084 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1085 # 3-way rotate. 1086 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1087 1088 if post_rotate_angle != 0: 1089 flattened_text_region = \ 1090 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1091 1092 negative_flattened_text_regions.append(flattened_text_region) 1093 1094 flattened_text_regions = ( 1095 *positive_flattened_text_regions, 1096 *negative_flattened_text_regions, 1097 ) 1098 return flattened_text_regions 1099 1100 def run(self, input: PageTextRegionStepInput, rng: RandomGenerator): 1101 page_distortion_step_output = input.page_distortion_step_output 1102 page_image = page_distortion_step_output.page_image 1103 page_char_polygon_collection = page_distortion_step_output.page_char_polygon_collection 1104 page_disconnected_text_region_collection = \ 1105 page_distortion_step_output.page_disconnected_text_region_collection 1106 page_non_text_region_collection = \ 1107 page_distortion_step_output.page_non_text_region_collection 1108 1109 page_resizing_step_output = input.page_resizing_step_output 1110 page_resized_text_line_mask = page_resizing_step_output.page_text_line_mask 1111 1112 debug = None 1113 if self.config.enable_debug: 1114 debug = PageTextRegionStepDebug() 1115 1116 # Build R-tree to track text regions. 1117 disconnected_text_region_polygons: List[Polygon] = [] 1118 disconnected_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1119 for polygon in page_disconnected_text_region_collection.to_polygons(): 1120 disconnected_text_region_polygons.append(polygon) 1121 shapely_polygon = polygon.to_shapely_polygon() 1122 disconnected_text_region_shapely_polygons.append(shapely_polygon) 1123 1124 disconnected_text_region_tree = STRtree(disconnected_text_region_shapely_polygons) 1125 1126 # Get the precise text regions. 1127 precise_text_region_candidate_polygons: List[Polygon] = [] 1128 for resized_precise_polygon in page_resized_text_line_mask.to_disconnected_polygons(): 1129 # Resize back to the shape after distortion. 1130 precise_polygon = resized_precise_polygon.to_conducted_resized_polygon( 1131 page_resized_text_line_mask, 1132 resized_height=page_image.height, 1133 resized_width=page_image.width, 1134 ) 1135 1136 # Find and extract intersected text region. 1137 # NOTE: One precise_polygon could be overlapped with 1138 # more than one disconnected_text_region_polygon! 1139 for _, _, disconnected_text_region_mask, precise_mask, _ in \ 1140 self.strtree_query_intersected_polygons( 1141 strtree=disconnected_text_region_tree, 1142 anchor_polygons=disconnected_text_region_polygons, 1143 candidate_polygon=precise_polygon, 1144 ): 1145 precise_text_region_candidate_polygons.extend( 1146 self.generate_precise_text_region_candidate_polygons( 1147 precise_mask=precise_mask, 1148 disconnected_text_region_mask=disconnected_text_region_mask, 1149 ) 1150 ) 1151 1152 if debug: 1153 debug.page_image = page_image 1154 debug.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons 1155 1156 # Help gc. 1157 del disconnected_text_region_polygons 1158 del disconnected_text_region_shapely_polygons 1159 del disconnected_text_region_tree 1160 1161 # Bind char-level polygon to precise text region. 1162 precise_text_region_polygons: List[Polygon] = [] 1163 precise_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1164 1165 for polygon in precise_text_region_candidate_polygons: 1166 precise_text_region_polygons.append(polygon) 1167 shapely_polygon = polygon.to_shapely_polygon() 1168 precise_text_region_shapely_polygons.append(shapely_polygon) 1169 1170 precise_text_region_tree = STRtree(precise_text_region_shapely_polygons) 1171 1172 if not self.config.use_adjusted_char_polygons: 1173 selected_char_polygons = page_char_polygon_collection.char_polygons 1174 else: 1175 selected_char_polygons = page_char_polygon_collection.adjusted_char_polygons 1176 1177 ptrp_idx_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list) 1178 1179 for char_polygon in selected_char_polygons: 1180 best_precise_text_region_polygon_idx = None 1181 intersected_ratio_max = 0 1182 1183 for ( 1184 precise_text_region_polygon_idx, 1185 _, 1186 _, 1187 _, 1188 intersected_ratio, 1189 ) in self.strtree_query_intersected_polygons( 1190 strtree=precise_text_region_tree, 1191 anchor_polygons=precise_text_region_polygons, 1192 candidate_polygon=char_polygon, 1193 ): 1194 if intersected_ratio > intersected_ratio_max: 1195 intersected_ratio_max = intersected_ratio 1196 best_precise_text_region_polygon_idx = precise_text_region_polygon_idx 1197 1198 if best_precise_text_region_polygon_idx is not None: 1199 ptrp_idx_to_char_polygons[best_precise_text_region_polygon_idx].append(char_polygon) 1200 else: 1201 # NOTE: Text line with only a small char (i.e. delimiter) could enter this branch. 1202 # In such case, the text line bounding box is smaller than the char polygon, since 1203 # the leading/trailing char paddings are ignored during text line rendering. 1204 # It's acceptable for now since: 1) this case happens rarely, 2) and it won't 1205 # introduce labeling noise. 1206 logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}') 1207 1208 page_text_region_infos: List[PageTextRegionInfo] = [] 1209 for ptrp_idx, precise_text_region_polygon in enumerate(precise_text_region_polygons): 1210 if ptrp_idx not in ptrp_idx_to_char_polygons: 1211 continue 1212 page_text_region_infos.append( 1213 PageTextRegionInfo( 1214 precise_text_region_polygon=precise_text_region_polygon, 1215 char_polygons=ptrp_idx_to_char_polygons[ptrp_idx], 1216 ) 1217 ) 1218 1219 # Help gc. 1220 del precise_text_region_polygons 1221 del precise_text_region_shapely_polygons 1222 del precise_text_region_tree 1223 1224 if debug: 1225 debug.page_text_region_infos = page_text_region_infos 1226 1227 # Negative sampling. 1228 page_non_text_region_polygons = self.sample_page_non_text_region_polygons( 1229 page_non_text_region_polygons=tuple(page_non_text_region_collection.to_polygons()), 1230 num_page_text_region_infos=len(page_text_region_infos), 1231 rng=rng, 1232 ) 1233 1234 flattened_text_regions = self.build_flattened_text_regions( 1235 page_image=page_image, 1236 page_text_region_infos=page_text_region_infos, 1237 page_non_text_region_polygons=page_non_text_region_polygons, 1238 rng=rng, 1239 ) 1240 if debug: 1241 debug.flattened_text_regions = flattened_text_regions 1242 1243 # Stack text regions. 1244 ( 1245 image, 1246 active_mask, 1247 text_region_boxes, 1248 char_polygons, 1249 char_polygon_text_region_box_indices, 1250 ) = stack_flattened_text_regions( 1251 page_pad=0, 1252 flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad, 1253 flattened_text_regions=flattened_text_regions, 1254 ) 1255 1256 text_region_polygons = [ 1257 text_region_box.to_polygon() for text_region_box in text_region_boxes 1258 ] 1259 1260 # Post uniform rotation. 1261 shape_before_rotate = image.shape 1262 rotate_angle = 0 1263 1264 if rng.random() < self.config.prob_post_rotate_90_angle: 1265 rotate_angle = 90 1266 1267 if rng.random() < self.config.prob_post_rotate_random_angle: 1268 rotate_angle += int( 1269 rng.integers( 1270 self.config.post_rotate_random_angle_min, 1271 self.config.post_rotate_random_angle_max + 1, 1272 ) 1273 ) 1274 1275 if rotate_angle != 0: 1276 # For unpacking. 1277 num_char_polygons = len(char_polygons) 1278 rotated_result = rotate.distort( 1279 {'angle': rotate_angle}, 1280 image=image, 1281 mask=active_mask, 1282 polygons=(*char_polygons, *text_region_polygons), 1283 ) 1284 assert rotated_result.image and rotated_result.mask and rotated_result.polygons 1285 image = rotated_result.image 1286 active_mask = rotated_result.mask 1287 char_polygons = rotated_result.polygons[:num_char_polygons] 1288 text_region_polygons = rotated_result.polygons[num_char_polygons:] 1289 1290 return PageTextRegionStepOutput( 1291 page_image=image, 1292 page_active_mask=active_mask, 1293 page_char_polygons=char_polygons, 1294 page_text_region_polygons=text_region_polygons, 1295 page_char_polygon_text_region_polygon_indices=char_polygon_text_region_box_indices, 1296 shape_before_rotate=shape_before_rotate, 1297 rotate_angle=rotate_angle, 1298 debug=debug, 1299 )
Abstract base class for generic types.
A generic type is typically declared by inheriting from this class parameterized with one or more type variables. For example, a generic mapping type might be defined as::
class Mapping(Generic[KT, VT]): def __getitem__(self, key: KT) -> VT: ... # Etc.
This class can then be used as follows::
def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT: try: return mapping[key] except KeyError: return default
868 @classmethod 869 def generate_precise_text_region_candidate_polygons( 870 cls, 871 precise_mask: Mask, 872 disconnected_text_region_mask: Mask, 873 ): 874 assert precise_mask.box and disconnected_text_region_mask.box 875 876 # Get the intersection. 877 intersected_box = Box( 878 up=max(precise_mask.box.up, disconnected_text_region_mask.box.up), 879 down=min(precise_mask.box.down, disconnected_text_region_mask.box.down), 880 left=max(precise_mask.box.left, disconnected_text_region_mask.box.left), 881 right=min(precise_mask.box.right, disconnected_text_region_mask.box.right), 882 ) 883 assert intersected_box.up <= intersected_box.down 884 assert intersected_box.left <= intersected_box.right 885 886 precise_mask = intersected_box.extract_mask(precise_mask) 887 disconnected_text_region_mask = intersected_box.extract_mask(disconnected_text_region_mask) 888 889 # Apply mask bitwise-and operation. 890 intersected_mask = Mask( 891 mat=(disconnected_text_region_mask.mat & precise_mask.mat).astype(np.uint8) 892 ) 893 intersected_mask = intersected_mask.to_box_attached(intersected_box) 894 895 # NOTE: 896 # 1. Could extract more than one polygons. 897 # 2. Some polygons are in border and should be removed later. 898 return intersected_mask.to_disconnected_polygons()
900 @classmethod 901 def strtree_query_intersected_polygons( 902 cls, 903 strtree: STRtree, 904 anchor_polygons: Sequence[Polygon], 905 candidate_polygon: Polygon, 906 ): 907 candidate_shapely_polygon = candidate_polygon.to_shapely_polygon() 908 candidate_mask = candidate_polygon.mask 909 910 for anchor_idx in sorted(strtree.query(candidate_shapely_polygon)): 911 anchor_polygon = anchor_polygons[anchor_idx] 912 anchor_mask = anchor_polygon.mask 913 914 intersected_ratio = calculate_boxed_masks_intersected_ratio( 915 anchor_mask=anchor_mask, 916 candidate_mask=candidate_mask, 917 use_candidate_as_base=True, 918 ) 919 920 yield ( 921 anchor_idx, 922 anchor_polygon, 923 anchor_mask, 924 candidate_mask, 925 intersected_ratio, 926 )
928 def sample_page_non_text_region_polygons( 929 self, 930 page_non_text_region_polygons: Sequence[Polygon], 931 num_page_text_region_infos: int, 932 rng: RandomGenerator, 933 ): 934 negative_ratio = self.config.negative_text_region_ratio 935 num_page_non_text_region_polygons = round( 936 negative_ratio * num_page_text_region_infos / (1 - negative_ratio) 937 ) 938 return rng_choice_with_size( 939 rng, 940 page_non_text_region_polygons, 941 size=min( 942 num_page_non_text_region_polygons, 943 len(page_non_text_region_polygons), 944 ), 945 replace=False, 946 )
948 def build_flattened_text_regions( 949 self, 950 page_image: Image, 951 page_text_region_infos: Sequence[PageTextRegionInfo], 952 page_non_text_region_polygons: Sequence[Polygon], 953 rng: RandomGenerator, 954 ): 955 text_region_polygon_dilate_ratio = float( 956 rng.uniform( 957 self.config.text_region_flattener_text_region_polygon_dilate_ratio_min, 958 self.config.text_region_flattener_text_region_polygon_dilate_ratio_max, 959 ) 960 ) 961 typical_long_side_ratio_min = \ 962 self.config.text_region_flattener_typical_long_side_ratio_min 963 964 text_region_polygons: List[Polygon] = [] 965 grouped_char_polygons: List[Sequence[Polygon]] = [] 966 for page_text_region_info in page_text_region_infos: 967 text_region_polygons.append(page_text_region_info.precise_text_region_polygon) 968 grouped_char_polygons.append(page_text_region_info.char_polygons) 969 970 # Inject nagative regions. 971 for page_non_text_region_polygon in page_non_text_region_polygons: 972 # NOTE: Don't drop any text region here, otherwise will introduce labeling confusion, 973 # since dropped text region will be considered as non-text region. 974 text_region_polygons.append(page_non_text_region_polygon) 975 grouped_char_polygons.append(tuple()) 976 977 text_region_flattener = TextRegionFlattener( 978 typical_long_side_ratio_min=typical_long_side_ratio_min, 979 text_region_polygon_dilate_ratio=text_region_polygon_dilate_ratio, 980 image=page_image, 981 text_region_polygons=text_region_polygons, 982 grouped_char_polygons=grouped_char_polygons, 983 is_training=True, 984 ) 985 986 # Resize positive ftr. 987 positive_flattened_text_regions: List[FlattenedTextRegion] = [] 988 # For negative sampling. 989 positive_reference_heights: List[float] = [] 990 positive_reference_widths: List[float] = [] 991 num_negative_flattened_text_regions = 0 992 993 for flattened_text_region in text_region_flattener.flattened_text_regions: 994 if not flattened_text_region.flattened_char_polygons: 995 num_negative_flattened_text_regions += 1 996 continue 997 998 if len(flattened_text_region.flattened_char_polygons) == 1 \ 999 and rng.random() < self.config.prob_drop_single_char_page_text_region_info: 1000 # Ignore some single-char text region for reducing label confusion. 1001 continue 1002 1003 char_height_median = flattened_text_region.get_char_height_meidan() 1004 1005 text_region_resize_char_height_median = int( 1006 rng.integers( 1007 self.config.text_region_resize_char_height_median_min, 1008 self.config.text_region_resize_char_height_median_max + 1, 1009 ) 1010 ) 1011 scale = text_region_resize_char_height_median / char_height_median 1012 1013 height, width = flattened_text_region.shape 1014 resized_height = round(height * scale) 1015 resized_width = round(width * scale) 1016 1017 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1018 resized_height=resized_height, 1019 resized_width=resized_width, 1020 ) 1021 1022 positive_reference_heights.append(resized_height) 1023 positive_reference_widths.append(resized_width) 1024 1025 # Post rotate. 1026 post_rotate_angle = 0 1027 if flattened_text_region.is_typical: 1028 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1029 # Upside down only. 1030 post_rotate_angle = 180 1031 else: 1032 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1033 # 3-way rotate. 1034 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1035 1036 if post_rotate_angle != 0: 1037 flattened_text_region = \ 1038 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1039 1040 positive_flattened_text_regions.append(flattened_text_region) 1041 1042 # Resize negative ftr. 1043 negative_reference_heights = list( 1044 rng_choice_with_size( 1045 rng, 1046 positive_reference_heights, 1047 size=num_negative_flattened_text_regions, 1048 replace=(num_negative_flattened_text_regions > len(positive_reference_heights)), 1049 ) 1050 ) 1051 1052 negative_height_max = max(positive_reference_heights) 1053 negative_width_max = max(positive_reference_widths) 1054 1055 negative_flattened_text_regions: List[FlattenedTextRegion] = [] 1056 1057 for flattened_text_region in text_region_flattener.flattened_text_regions: 1058 if flattened_text_region.flattened_char_polygons: 1059 continue 1060 1061 reference_height = negative_reference_heights.pop() 1062 scale = reference_height / flattened_text_region.height 1063 1064 height, width = flattened_text_region.shape 1065 resized_height = round(height * scale) 1066 resized_width = round(width * scale) 1067 1068 # Remove negative region that is too large. 1069 if resized_height > negative_height_max or resized_width > negative_width_max: 1070 continue 1071 1072 flattened_text_region = flattened_text_region.to_resized_flattened_text_region( 1073 resized_height=resized_height, 1074 resized_width=resized_width, 1075 ) 1076 1077 # Post rotate. 1078 post_rotate_angle = 0 1079 if flattened_text_region.is_typical: 1080 if rng.random() < self.config.prob_text_region_typical_post_rotate: 1081 # Upside down only. 1082 post_rotate_angle = 180 1083 else: 1084 if rng.random() < self.config.prob_text_region_untypical_post_rotate: 1085 # 3-way rotate. 1086 post_rotate_angle = rng_choice(rng, (180, 90, 270), probs=(0.5, 0.25, 0.25)) 1087 1088 if post_rotate_angle != 0: 1089 flattened_text_region = \ 1090 flattened_text_region.to_post_rotated_flattened_text_region(post_rotate_angle) 1091 1092 negative_flattened_text_regions.append(flattened_text_region) 1093 1094 flattened_text_regions = ( 1095 *positive_flattened_text_regions, 1096 *negative_flattened_text_regions, 1097 ) 1098 return flattened_text_regions
1100 def run(self, input: PageTextRegionStepInput, rng: RandomGenerator): 1101 page_distortion_step_output = input.page_distortion_step_output 1102 page_image = page_distortion_step_output.page_image 1103 page_char_polygon_collection = page_distortion_step_output.page_char_polygon_collection 1104 page_disconnected_text_region_collection = \ 1105 page_distortion_step_output.page_disconnected_text_region_collection 1106 page_non_text_region_collection = \ 1107 page_distortion_step_output.page_non_text_region_collection 1108 1109 page_resizing_step_output = input.page_resizing_step_output 1110 page_resized_text_line_mask = page_resizing_step_output.page_text_line_mask 1111 1112 debug = None 1113 if self.config.enable_debug: 1114 debug = PageTextRegionStepDebug() 1115 1116 # Build R-tree to track text regions. 1117 disconnected_text_region_polygons: List[Polygon] = [] 1118 disconnected_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1119 for polygon in page_disconnected_text_region_collection.to_polygons(): 1120 disconnected_text_region_polygons.append(polygon) 1121 shapely_polygon = polygon.to_shapely_polygon() 1122 disconnected_text_region_shapely_polygons.append(shapely_polygon) 1123 1124 disconnected_text_region_tree = STRtree(disconnected_text_region_shapely_polygons) 1125 1126 # Get the precise text regions. 1127 precise_text_region_candidate_polygons: List[Polygon] = [] 1128 for resized_precise_polygon in page_resized_text_line_mask.to_disconnected_polygons(): 1129 # Resize back to the shape after distortion. 1130 precise_polygon = resized_precise_polygon.to_conducted_resized_polygon( 1131 page_resized_text_line_mask, 1132 resized_height=page_image.height, 1133 resized_width=page_image.width, 1134 ) 1135 1136 # Find and extract intersected text region. 1137 # NOTE: One precise_polygon could be overlapped with 1138 # more than one disconnected_text_region_polygon! 1139 for _, _, disconnected_text_region_mask, precise_mask, _ in \ 1140 self.strtree_query_intersected_polygons( 1141 strtree=disconnected_text_region_tree, 1142 anchor_polygons=disconnected_text_region_polygons, 1143 candidate_polygon=precise_polygon, 1144 ): 1145 precise_text_region_candidate_polygons.extend( 1146 self.generate_precise_text_region_candidate_polygons( 1147 precise_mask=precise_mask, 1148 disconnected_text_region_mask=disconnected_text_region_mask, 1149 ) 1150 ) 1151 1152 if debug: 1153 debug.page_image = page_image 1154 debug.precise_text_region_candidate_polygons = precise_text_region_candidate_polygons 1155 1156 # Help gc. 1157 del disconnected_text_region_polygons 1158 del disconnected_text_region_shapely_polygons 1159 del disconnected_text_region_tree 1160 1161 # Bind char-level polygon to precise text region. 1162 precise_text_region_polygons: List[Polygon] = [] 1163 precise_text_region_shapely_polygons: List[ShapelyPolygon] = [] 1164 1165 for polygon in precise_text_region_candidate_polygons: 1166 precise_text_region_polygons.append(polygon) 1167 shapely_polygon = polygon.to_shapely_polygon() 1168 precise_text_region_shapely_polygons.append(shapely_polygon) 1169 1170 precise_text_region_tree = STRtree(precise_text_region_shapely_polygons) 1171 1172 if not self.config.use_adjusted_char_polygons: 1173 selected_char_polygons = page_char_polygon_collection.char_polygons 1174 else: 1175 selected_char_polygons = page_char_polygon_collection.adjusted_char_polygons 1176 1177 ptrp_idx_to_char_polygons: DefaultDict[int, List[Polygon]] = defaultdict(list) 1178 1179 for char_polygon in selected_char_polygons: 1180 best_precise_text_region_polygon_idx = None 1181 intersected_ratio_max = 0 1182 1183 for ( 1184 precise_text_region_polygon_idx, 1185 _, 1186 _, 1187 _, 1188 intersected_ratio, 1189 ) in self.strtree_query_intersected_polygons( 1190 strtree=precise_text_region_tree, 1191 anchor_polygons=precise_text_region_polygons, 1192 candidate_polygon=char_polygon, 1193 ): 1194 if intersected_ratio > intersected_ratio_max: 1195 intersected_ratio_max = intersected_ratio 1196 best_precise_text_region_polygon_idx = precise_text_region_polygon_idx 1197 1198 if best_precise_text_region_polygon_idx is not None: 1199 ptrp_idx_to_char_polygons[best_precise_text_region_polygon_idx].append(char_polygon) 1200 else: 1201 # NOTE: Text line with only a small char (i.e. delimiter) could enter this branch. 1202 # In such case, the text line bounding box is smaller than the char polygon, since 1203 # the leading/trailing char paddings are ignored during text line rendering. 1204 # It's acceptable for now since: 1) this case happens rarely, 2) and it won't 1205 # introduce labeling noise. 1206 logger.warning(f'Cannot assign a text region for char_polygon={char_polygon}') 1207 1208 page_text_region_infos: List[PageTextRegionInfo] = [] 1209 for ptrp_idx, precise_text_region_polygon in enumerate(precise_text_region_polygons): 1210 if ptrp_idx not in ptrp_idx_to_char_polygons: 1211 continue 1212 page_text_region_infos.append( 1213 PageTextRegionInfo( 1214 precise_text_region_polygon=precise_text_region_polygon, 1215 char_polygons=ptrp_idx_to_char_polygons[ptrp_idx], 1216 ) 1217 ) 1218 1219 # Help gc. 1220 del precise_text_region_polygons 1221 del precise_text_region_shapely_polygons 1222 del precise_text_region_tree 1223 1224 if debug: 1225 debug.page_text_region_infos = page_text_region_infos 1226 1227 # Negative sampling. 1228 page_non_text_region_polygons = self.sample_page_non_text_region_polygons( 1229 page_non_text_region_polygons=tuple(page_non_text_region_collection.to_polygons()), 1230 num_page_text_region_infos=len(page_text_region_infos), 1231 rng=rng, 1232 ) 1233 1234 flattened_text_regions = self.build_flattened_text_regions( 1235 page_image=page_image, 1236 page_text_region_infos=page_text_region_infos, 1237 page_non_text_region_polygons=page_non_text_region_polygons, 1238 rng=rng, 1239 ) 1240 if debug: 1241 debug.flattened_text_regions = flattened_text_regions 1242 1243 # Stack text regions. 1244 ( 1245 image, 1246 active_mask, 1247 text_region_boxes, 1248 char_polygons, 1249 char_polygon_text_region_box_indices, 1250 ) = stack_flattened_text_regions( 1251 page_pad=0, 1252 flattened_text_regions_pad=self.config.stack_flattened_text_regions_pad, 1253 flattened_text_regions=flattened_text_regions, 1254 ) 1255 1256 text_region_polygons = [ 1257 text_region_box.to_polygon() for text_region_box in text_region_boxes 1258 ] 1259 1260 # Post uniform rotation. 1261 shape_before_rotate = image.shape 1262 rotate_angle = 0 1263 1264 if rng.random() < self.config.prob_post_rotate_90_angle: 1265 rotate_angle = 90 1266 1267 if rng.random() < self.config.prob_post_rotate_random_angle: 1268 rotate_angle += int( 1269 rng.integers( 1270 self.config.post_rotate_random_angle_min, 1271 self.config.post_rotate_random_angle_max + 1, 1272 ) 1273 ) 1274 1275 if rotate_angle != 0: 1276 # For unpacking. 1277 num_char_polygons = len(char_polygons) 1278 rotated_result = rotate.distort( 1279 {'angle': rotate_angle}, 1280 image=image, 1281 mask=active_mask, 1282 polygons=(*char_polygons, *text_region_polygons), 1283 ) 1284 assert rotated_result.image and rotated_result.mask and rotated_result.polygons 1285 image = rotated_result.image 1286 active_mask = rotated_result.mask 1287 char_polygons = rotated_result.polygons[:num_char_polygons] 1288 text_region_polygons = rotated_result.polygons[num_char_polygons:] 1289 1290 return PageTextRegionStepOutput( 1291 page_image=image, 1292 page_active_mask=active_mask, 1293 page_char_polygons=char_polygons, 1294 page_text_region_polygons=text_region_polygons, 1295 page_char_polygon_text_region_polygon_indices=char_polygon_text_region_box_indices, 1296 shape_before_rotate=shape_before_rotate, 1297 rotate_angle=rotate_angle, 1298 debug=debug, 1299 )