Source code for veil.config.entity_detectors

from dataclasses import field
from typing import Dict, List

from veil.config.core.base_poly_config import BasePolyConfig
from veil.config.core.frozen_dataclass import frozen_dataclass
from veil.core.enums.entity_detector_type import EntityDetectorType


@frozen_dataclass
class BaseEntityDetectorConfig(BasePolyConfig):
    """Common base for every entity detector configuration.

    Declares the two precedence-related settings shared by all detector
    configs and the ``get_type`` hook that the concrete configs in this
    module override.
    """

    # Per-entity-type priority, keyed by canonical entity type name.
    # Smaller numbers rank higher (0 is the highest priority). When several
    # detectors emit overlapping spans, the span whose entity type has the
    # lowest priority value is the one that is preferred.
    priority: Dict[str, int] = field(
        default_factory=dict,
        metadata={
            "help": "Map canonical entity type name -> integer priority. Lower wins; 0 is highest.",
        },
    )

    # Position of this detector in the cross-component hierarchy; 0 takes
    # precedence over everything else, larger values rank lower.
    hierarchy_position: int = field(
        default=0,
        metadata={
            "help": "Detector-level hierarchy position. 0 is highest precedence; larger = lower.",
        },
    )

    @classmethod
    def get_type(cls):
        """Return the detector type for this config.

        The base class has no type of its own.

        Raises:
            NotImplementedError: always; subclasses are expected to override.
        """
        raise NotImplementedError
@frozen_dataclass
class RegexEntityDetectorConfig(BaseEntityDetectorConfig):
    """Settings for Veil's RegexEntityDetector engine.

    Every option is a plain flag or threshold; the ``help`` metadata on each
    field carries its user-facing description.
    """

    # On by default: run checksum validation for entity types that have one.
    enable_validation: bool = field(
        default=True,
        metadata={"help": "Enable checksum validation where available"},
    )

    # 0.0 by default.
    min_confidence: float = field(
        default=0.0,
        metadata={"help": "Minimum confidence threshold to keep entity"},
    )

    # On by default: keep the original spacing/punctuation where possible.
    preserve_format: bool = field(
        default=True,
        metadata={"help": "Preserve original spacing/punctuation where possible"},
    )

    # Off by default: regex search is case-insensitive unless enabled.
    case_sensitive: bool = field(
        default=False,
        metadata={"help": "Case-sensitive regex search"},
    )

    @classmethod
    def get_type(cls):
        """Return ``EntityDetectorType.REGEX``, the tag for this config."""
        return EntityDetectorType.REGEX
@frozen_dataclass
class MaskerApiEntityDetectorConfig(BaseEntityDetectorConfig):
    """Configuration for entity detectors backed by a Masker API.

    ``api_url`` and ``model`` are mandatory: both default to an empty string
    and ``__post_init__`` rejects the config unless each one is set to a
    non-blank value. The remaining fields are optional tuning knobs whose
    meaning is described by their ``help`` metadata.

    Raises:
        ValueError: on construction, when ``api_url`` or ``model`` is
            missing, empty or whitespace-only.
    """

    # --- Endpoint and model selection -------------------------------------
    api_url: str = field(
        default="",
        metadata={"help": "URL of the hosted Masker API."},
    )
    headers: Dict[str, str] = field(
        default_factory=dict,
        metadata={"help": "Headers to send to the hosted Masker API."},
    )
    model: str = field(
        default="",
        metadata={"help": "Model to use for the hosted Masker API."},
    )
    system_prompt: str = field(
        default="You are a helpful assistant that masks sensitive information.",
        metadata={"help": "System prompt to send to the hosted Masker API."},
    )

    # --- Generation parameters --------------------------------------------
    # Float-typed fields use float literals so the default value has the
    # annotated type, like the other float fields in this module.
    max_tokens: int = field(
        default=4000,
        metadata={"help": "Maximum number of tokens to generate."},
    )
    top_p: float = field(
        default=1.0,
        metadata={"help": "Top-p value for the hosted Masker API."},
    )
    top_k: int = field(
        default=40,
        metadata={"help": "Top-k value for the hosted Masker API."},
    )
    presence_penalty: float = field(
        default=0.0,
        metadata={"help": "Presence penalty for the hosted Masker API."},
    )
    frequency_penalty: float = field(
        default=0.0,
        metadata={"help": "Frequency penalty for the hosted Masker API."},
    )
    temperature: float = field(
        default=0.6,
        metadata={"help": "Temperature for the hosted Masker API."},
    )

    # --- Timeout, retries and truncation handling --------------------------
    timeout: float = field(
        default=30.0,
        metadata={"help": "Timeout for the hosted Masker API."},
    )
    retries: int = field(
        default=2,
        metadata={
            "help": "Number of retries on network errors or suspected truncation."
        },
    )
    retry_backoff_base: float = field(
        default=0.5,
        metadata={"help": "Base seconds for exponential backoff between retries."},
    )
    retry_on_truncation: bool = field(
        default=True,
        metadata={"help": "Retry when truncation is detected in the response."},
    )
    chunk_on_truncation: bool = field(
        default=True,
        metadata={"help": "Fallback to chunking the input if truncation persists."},
    )
    chunk_char_limit: int = field(
        default=4000,
        metadata={
            "help": "Approximate maximum characters per chunk when chunking input."
        },
    )
    truncation_min_fraction: float = field(
        default=0.6,
        metadata={
            "help": "Minimum fraction of original length expected; smaller suggests truncation."
        },
    )

    @classmethod
    def get_type(cls):
        """Return ``EntityDetectorType.MASKER_API``, the tag for this config."""
        return EntityDetectorType.MASKER_API

    def __post_init__(self):
        """Validate that the mandatory settings were actually provided.

        Raises:
            ValueError: if ``api_url`` or ``model`` is falsy or consists only
                of whitespace. ``api_url`` is checked first.
        """
        required = (
            (self.api_url, "Masking API URL not provided in config."),
            (self.model, "Model not provided in config. Expected 'model'."),
        )
        for value, message in required:
            # A whitespace-only string is truthy but is no more usable than
            # an empty one, so treat it as "not provided" as well. The
            # isinstance guard keeps non-string truthy values passing as
            # they did before.
            if not value or (isinstance(value, str) and not value.strip()):
                raise ValueError(message)
@frozen_dataclass
class HostedMaskerApiEntityDetectorConfig(MaskerApiEntityDetectorConfig):
    """Config for the hosted Masker API entity detector integration.

    Adds no fields of its own; it inherits everything from
    ``MaskerApiEntityDetectorConfig`` and only changes the reported type.
    """

    @classmethod
    def get_type(cls):
        """Return ``EntityDetectorType.HOSTED_MASKER_API`` for this config."""
        return EntityDetectorType.HOSTED_MASKER_API
@frozen_dataclass
class GlinerEntityDetectorConfig(BaseEntityDetectorConfig):
    """Settings for the Gliner entity detector integration.

    Fields fall into two groups: model/inference options first, then the
    post-processing controls applied to detected spans. Each field's
    ``help`` metadata holds its user-facing description.
    """

    # --- Model and inference ------------------------------------------------
    # A fresh list is built per instance so the default is never shared.
    labels: List[str] = field(
        default_factory=lambda: ["name", "company", "address"],
        metadata={"help": "Entity types to detect."},
    )
    model: str = field(
        default="urchade/gliner_multi-v2.1",
        metadata={"help": "Name of the HF Gliner model to be used."},
    )
    cuda_device: int = field(
        default=0,
        metadata={"help": "CUDA device number to use (-1 for CPU)"},
    )
    threshold: float = field(
        default=0.6,
        metadata={
            "help": "Score threshold. Increase for higher precision (fewer spans)."
        },
    )
    batch_size: int = field(
        default=8,
        metadata={"help": "Batch size for inference."},
    )
    max_length: int = field(
        default=384,
        metadata={
            "help": "Maximum sequence length in tokens. Documents longer than this will be chunked."
        },
    )
    chunk_overlap: int = field(
        default=50,
        metadata={
            "help": "Number of tokens to overlap between chunks when processing long documents."
        },
    )

    # --- Post-processing controls -------------------------------------------
    nms_iou_threshold: float = field(
        default=0.8,
        metadata={
            "help": "IoU threshold for merging overlapping spans across chunks (higher = more aggressive merge)."
        },
    )
    min_span_chars: int = field(
        default=3,
        metadata={"help": "Drop spans shorter than this many characters."},
    )
    max_span_chars: int = field(
        default=80,
        metadata={"help": "Drop spans longer than this many characters."},
    )
    top_k_per_chunk: int = field(
        default=100,
        metadata={
            "help": "Keep only top-K highest-scoring spans per chunk after thresholding (0 disables)."
        },
    )

    @classmethod
    def get_type(cls):
        """Return ``EntityDetectorType.GLINER``, the tag for this config."""
        return EntityDetectorType.GLINER
@frozen_dataclass
class SpacyEntityDetectorConfig(BaseEntityDetectorConfig):
    """Settings for the Spacy entity detector integration.

    Only the model name and the device selection are configurable here.
    """

    # Defaults to the "es_core_news_sm" model.
    model: str = field(
        default="es_core_news_sm",
        metadata={"help": "Name of the Spacy model to be used."},
    )

    # Defaults to device 0; per the help text, -1 selects the CPU.
    cuda_device: int = field(
        default=0,
        metadata={"help": "CUDA device number to use (-1 for CPU)"},
    )

    @classmethod
    def get_type(cls):
        """Return ``EntityDetectorType.SPACY``, the tag for this config."""
        return EntityDetectorType.SPACY