Skip to content

Commit ebbe9b1

Browse files
Fix donut backtracking (huggingface#37788)
* Fix donut backtracking
* make fixup
* Trigger tests
* Remove old line
* Update code
* Fix reversed slice
1 parent 06c4d05 commit ebbe9b1

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

src/transformers/models/donut/processing_donut.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -156,14 +156,18 @@ def token2json(self, tokens, is_inner_value=False, added_vocab=None):
156156
output = {}
157157

158158
while tokens:
159-
start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
160-
if start_token is None:
159+
# We want r"<s_(.*?)>" but without ReDOS risk, so do it manually in two parts
160+
potential_start = re.search(r"<s_", tokens, re.IGNORECASE)
161+
if potential_start is None:
161162
break
162-
key = start_token.group(1)
163+
start_token = tokens[potential_start.start() :]
164+
if ">" not in start_token:
165+
break
166+
start_token = start_token[: start_token.index(">") + 1]
167+
key = start_token[len("<s_") : -len(">")]
163168
key_escaped = re.escape(key)
164169

165170
end_token = re.search(rf"</s_{key_escaped}>", tokens, re.IGNORECASE)
166-
start_token = start_token.group()
167171
if end_token is None:
168172
tokens = tokens.replace(start_token, "")
169173
else:

0 commit comments

Comments
 (0)