Source code for veil.masker

from typing import List

from veil.config.masker import MaskerConfig
from veil.core.document import Document
from veil.core.enums.mask_type import MaskType
from veil.core.mask_result import MaskResult
from veil.core.span import Span


class Masker:
    """Replace spans of a document's text with mask tokens.

    The mask style is resolved from ``config.method`` through
    ``MaskType.from_str``. ``MaskType.ASTERISK`` produces one ``*`` per
    masked character; any other mask type produces a bracketed tag built
    from the span's entity type name (and the span's ``id`` when present).
    """

    def __init__(self, config: MaskerConfig):
        """Store the masker configuration.

        Args:
            config: Configuration whose ``method`` attribute selects the
                mask style.
        """
        self.config = config

    def mask(self, doc: Document, spans: List[Span]) -> MaskResult:
        """Mask the text in document given a list of spans.

        Spans are processed in ascending ``start`` order. Offsets are
        converted with ``int`` and clamped to the bounds of the text; spans
        that are empty or inverted after clamping are skipped. When a span
        overlaps text already covered by an earlier span, only the part
        beyond the earlier span is masked, so no character is emitted twice
        and no covered character reappears unmasked.

        Args:
            doc: Document providing ``doc_id`` and ``text``.
            spans: Spans to mask. The list is not modified.

        Returns:
            A ``MaskResult`` holding the original text, the masked text, and
            the input spans ordered by ``start`` (an empty list when no
            spans were given).

        Raises:
            ValueError: If a tag-style mask is requested for a span that has
                no usable entity type.
        """
        if not spans:
            return MaskResult(
                doc_id=doc.doc_id,
                original_text=doc.text,
                masked_text=doc.text,
                entities=[],
            )

        # Sort a copy so the caller's list keeps its original order.
        ordered = sorted(spans, key=lambda s: s.start)

        text = doc.text
        text_len = len(text)
        masked_chunks: List[str] = []
        cursor = 0

        for span in ordered:
            start = max(0, min(text_len, int(span.start)))
            end = max(0, min(text_len, int(span.end)))

            # Never step back into text that an earlier span already
            # covered: a nested or overlapping span would otherwise rewind
            # the cursor and re-emit masked characters in clear text.
            start = max(start, cursor)
            if end <= start:
                continue

            # Unmasked region between the previous span and this one.
            if cursor < start:
                masked_chunks.append(text[cursor:start])

            # The span itself is not modified; the covered substring is
            # passed separately to compute the mask token.
            masked_chunks.append(self._mask_for_span(span, text[start:end]))
            cursor = end

        # Unmasked tail after the last span.
        if cursor < text_len:
            masked_chunks.append(text[cursor:])

        return MaskResult(
            doc_id=doc.doc_id,
            original_text=text,
            masked_text="".join(masked_chunks),
            entities=ordered,
        )

    def _mask_for_span(self, span: Span, original: str) -> str:
        """Build the replacement token for a single span.

        Args:
            span: Span being masked; its ``entity_type.name`` and optional
                ``id`` are used for tag-style masks.
            original: The substring of the document covered by the span.

        Returns:
            ``"*"`` repeated ``len(original)`` times for
            ``MaskType.ASTERISK``; otherwise ``"[<label>]"`` or
            ``"[<label><id>]"`` when the span has a non-``None`` ``id``.

        Raises:
            ValueError: If the entity type name cannot be read from the
                span (missing ``entity_type`` attribute, or a value without
                a ``name`` attribute).
        """
        mask_type = MaskType.from_str(self.config.method)
        if mask_type == MaskType.ASTERISK:
            return "*" * len(original)

        # Any other mask type falls through to the bracketed entity tag.
        try:
            label = span.entity_type.name
        except AttributeError as err:
            raise ValueError(f"ISpan {span} has no entity type") from err

        span_id = getattr(span, "id", None)
        token = f"{label}{span_id}" if span_id is not None else label
        return f"[{token}]"