fix(parser): recover stolen scope text and truncate next-standard bleed

Add Pass 3 to recover scope text that was incorrectly placed in the previous block, and Pass 4 to truncate text that bleeds in from the following standard. Regenerate standards.json and standards_chunks.json with the improved parser.
2026-05-04 00:18:17 +05:30
parent 80aa252c3e
commit 28bb4ca1de
3 changed files with 2964 additions and 3276 deletions
@@ -86,6 +86,17 @@ SECTION_CATEGORIES = {
 }
 # Block-parsing patterns used in split_into_standards Pass 3 / Pass 4
 # A "1. Scope" / "1) Scope" heading, matched case-insensitively.  The pattern
 # is unanchored, so it can match at any position in the searched text.
 _SCOPE_START_RE = re.compile(r"1\s*[\.\)]\s*Scope", re.IGNORECASE)
 # A numbered heading for sections 2-9: one digit, "." or ")", at least one
 # whitespace character, then an uppercase letter.  Case-sensitive, unanchored.
 _BODY_SECTION_RE = re.compile(r"[2-9]\s*[\.\)]\s+[A-Z]")
 # Same heading as _SCOPE_START_RE, but only when preceded by a line break and
 # optional whitespace; Pass 4 uses it to look for a second scope heading.
 _SCOPE2_RE = re.compile(r"\n\s*1\s*[\.\)]\s*Scope", re.IGNORECASE)
 # Minimum chars that must precede a second "1. Scope" marker before it is
 # considered bleed from the next standard (not a duplicate heading).
 _SCOPE_BLEED_MIN_OFFSET = 100
 # How far back (chars) to search the previous block for a stolen scope.
 _SCOPE_TAIL_WINDOW = 900
 # Sub-category headings that appear inside sections (e.g. "CEMENT", "AGGREGATES")
 SUB_CATEGORY_RE = re.compile(
    r"^(?:AGGREGATES|CEMENT|LIME|STONE|TIMBER|BITUMEN|GYPSUM|PIPE|WIRE|STEEL|"
    r"ASBESTOS|CONCRETE|MASONRY|CERAMIC|GLASS|PLASTIC|RUBBER|METAL|WOOD|"
@@ -356,6 +367,82 @@ def split_into_standards(full_text: str) -> list[dict]:
        split_count,
    )
    # --- Pass 3: recover scope text stolen by previous block ---
    # The SP-21 PDF sometimes prints a standard's scope content on the same page
    # as the preceding "SUMMARY OF IS N+1" header — so after PDF text extraction
    # the scope ends up at the tail of block N instead of the head of block N+1.
    # Detect: block N+1 body opens at section 2+ with no preceding 1. Scope.
    # Remedy: find the last "1. Scope" occurrence in block N's tail (the final
    # _SCOPE_TAIL_WINDOW characters) and move everything from the start of that
    # line to the start of block N+1's body.
    recovered = 0
    for i in range(1, len(all_blocks)):
        cur = all_blocks[i]
        cur_lines = cur.strip().split("\n")
        # Gather the full body text (everything after the first line, which is the IS ID).
        full_body = "\n".join(cur_lines[1:])
        scope_m = _SCOPE_START_RE.search(full_body)
        sec2_m = _BODY_SECTION_RE.search(full_body)
        # A real scope precedes section 2; if so, no repair needed.
        if scope_m and (sec2_m is None or scope_m.start() < sec2_m.start()):
            continue
        # Without a section-2+ heading there is no evidence the scope is missing.
        if sec2_m is None:
            continue
        prev = all_blocks[i - 1]
        # Clamp the window start at 0.  When prev is shorter than the window,
        # an unclamped "len(prev) - window" offset is negative and the slices
        # below would move the whole previous block (header included).
        tail_start = max(0, len(prev) - _SCOPE_TAIL_WINDOW)
        # Search prev in place so match offsets are already absolute, and keep
        # the LAST heading in the window: an earlier one may be prev's own scope.
        tail_matches = list(_SCOPE_START_RE.finditer(prev, tail_start))
        if not tail_matches:
            continue
        abs_scope_pos = tail_matches[-1].start()
        # rfind to start of line so we grab the full scope heading
        newline_pos = prev.rfind("\n", 0, abs_scope_pos)
        cut_pos = newline_pos + 1 if newline_pos != -1 else abs_scope_pos
        stolen = prev[cut_pos:].strip()
        trimmed_prev = prev[:cut_pos].rstrip()
        header_line = cur_lines[0]
        # Rebuild block N+1 as: IS-ID line, recovered scope, original body.
        all_blocks[i] = f"{header_line}\n{stolen}\n{full_body}".strip()
        all_blocks[i - 1] = trimmed_prev
        recovered += 1
        log.info(
            "Pass 3: recovered stolen scope for block %d (%s…) from block %d",
            i, header_line[:50], i - 1,
        )
    if recovered:
        log.info("Pass 3: recovered scope for %d block(s)", recovered)
    # --- Pass 4: truncate next-standard bleed ---
    # Mirror image of Pass 3: a block may carry the opening of the FOLLOWING
    # standard at its end.  When the body holds a second "1. Scope" heading
    # (found after the first one) that starts more than _SCOPE_BLEED_MIN_OFFSET
    # characters into the body, the body is cut at that heading and the rest
    # is dropped.
    trimmed_bleed = 0
    for idx, raw_block in enumerate(all_blocks):
        head, sep, rest = raw_block.strip().partition("\n")
        if not sep:
            # Header-only block: there is no body to inspect.
            continue
        leading_scope = _SCOPE_START_RE.search(rest)
        if leading_scope is None:
            continue
        # Only look past the block's own scope heading.
        bleed = _SCOPE2_RE.search(rest, leading_scope.end())
        if bleed is None or bleed.start() <= _SCOPE_BLEED_MIN_OFFSET:
            continue
        bleed_at = bleed.start()
        kept = rest[:bleed_at].rstrip()
        all_blocks[idx] = f"{head}\n{kept}"
        trimmed_bleed += 1
        log.info(
            "Pass 4: trimmed next-standard bleed from block %d (%s…) at pos %d",
            idx, head[:50], bleed_at,
        )
    if trimmed_bleed:
        log.info("Pass 4: trimmed bleed from %d block(s)", trimmed_bleed)
    # --- Parse each block + validate ---
    standards: list[dict] = []
    for block in all_blocks: