-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_utils.py
More file actions
114 lines (83 loc) · 3.01 KB
/
parse_utils.py
File metadata and controls
114 lines (83 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from __future__ import annotations
import ast
import html as _html
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence, Tuple
TABLE_RE = re.compile(r"<table.*?>.*?</table>", re.IGNORECASE | re.DOTALL)
ROW_RE = re.compile(r"<tr.*?>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
CELL_RE = re.compile(r"<t[dh].*?>(.*?)</t[dh]>", re.IGNORECASE | re.DOTALL)
REFDET_RE = re.compile(r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL)
URL_RE = re.compile(r"(https?://[^\s\]\)\"\'<>]+)")
MD_LINK_RE = re.compile(r"\[[^\]]+\]\((https?://[^\s\)]+)\)")
def clean_cell_text(cell_html: str) -> str:
text = re.sub(r"<.*?>", "", cell_html, flags=re.DOTALL)
text = _html.unescape(text)
text = re.sub(r"\s+", " ", text).strip()
return text
def html_table_to_markdown(html_table: str) -> str:
rows: List[List[str]] = []
for row_match in ROW_RE.finditer(html_table):
cells_raw = CELL_RE.findall(row_match.group(1))
cells = [clean_cell_text(c) for c in cells_raw]
if cells:
rows.append(cells)
if not rows:
return html_table
max_cols = max(len(r) for r in rows)
for r in rows:
if len(r) < max_cols:
r.extend([""] * (max_cols - len(r)))
header = rows[0]
sep = ["---"] * max_cols
md_lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(sep) + " |",
]
for r in rows[1:]:
md_lines.append("| " + " | ".join(r) + " |")
return "\n".join(md_lines)
def extract_tables(text: str) -> List[str]:
return TABLE_RE.findall(text)
def replace_tables_inline_markdown(text: str) -> str:
def repl(m: re.Match) -> str:
return html_table_to_markdown(m.group(0))
return TABLE_RE.sub(repl, text)
def extract_hyperlinks(text: str) -> List[str]:
found = set()
for m in URL_RE.finditer(text):
found.add(m.group(1))
for m in MD_LINK_RE.finditer(text):
found.add(m.group(1))
return sorted(found)
def re_match_refdet(text: str):
matches = REFDET_RE.findall(text)
# matches: list of tuples (full, label_type, coords_str)
return matches
def extract_coordinates_and_label(ref_tuple) -> Optional[Tuple[str, List[List[float]]]]:
try:
full, label_type, coords_str = ref_tuple
cor_list = ast.literal_eval(coords_str)
if isinstance(cor_list, (list, tuple)):
if not cor_list:
return None
first = cor_list[0]
if isinstance(first, (int, float)):
cor_list = [cor_list]
else:
return None
except Exception:
return None
# normalize to list[list[float]]
out: List[List[float]] = []
for p in cor_list:
if not isinstance(p, (list, tuple)) or len(p) != 4:
continue
out.append([float(p[0]), float(p[1]), float(p[2]), float(p[3])])
if not out:
return None
return label_type, out
@dataclass
class Region:
bbox: Tuple[int, int, int, int] # x1,y1,x2,y2
full_tag: str