Source code for veil.entity_detectors.regex.regex_entity_detector
"""
Pure regular-expression-based entity detector.
This detector uses optimized regex patterns to detect sensitive Spanish
identifiers, with optional checksum validation.
"""
from __future__ import annotations
import logging
from typing import List
from veil.config.entity_detectors import RegexEntityDetectorConfig
from veil.core.base_entity_detector import BaseEntityDetector
from veil.core.document import Document
from veil.core.span import Span
from veil.entity_detectors.regex.regex_entity_type import RegexEntityType
from .patterns import (
get_pattern_meta,
)
from .patterns import validate_pattern as validate_pattern_exact
from .validators import get_supported_validator_types, validate_identifier
logger = logging.getLogger(__name__)
[docs]
class RegexEntityDetector(BaseEntityDetector[RegexEntityType]):
    """
    Pure regular-expression-based entity detector.

    Uses optimized regex patterns to detect sensitive identifiers with optional
    checksum validation for higher precision.
    """

    # Every member of RegexEntityType is advertised by this detector.
    ENTITY_TYPES = set(RegexEntityType)

    def __init__(self, cfg: RegexEntityDetectorConfig):
        """
        Initialize the regex entity detector with a configuration.

        Args:
            cfg: Detector configuration. ``enable_validation``,
                ``min_confidence`` and ``case_sensitive`` are copied onto
                the instance.
        """
        super().__init__(cfg)
        self.enable_validation = cfg.enable_validation
        # NOTE(review): min_confidence and case_sensitive are stored here but
        # are not read by any method of this class -- confirm they are applied
        # elsewhere (e.g. by the base class or when patterns are compiled).
        self.min_confidence = cfg.min_confidence
        self.case_sensitive = cfg.case_sensitive

    def detect_entities(self, doc: Document) -> List[Span]:
        """
        Detect entities in the text using regex patterns.

        Args:
            doc: Document to analyze; only ``doc.text`` is read.

        Returns:
            List[Span]: Detected entities, concatenated per entity type in
            ``RegexEntityType`` iteration order.

        Raises:
            Exception: Any error raised while searching is logged and then
                re-raised unchanged.
        """
        text = doc.text
        entities: List[Span] = []
        try:
            # Search each entity type
            for entity_type in RegexEntityType:
                entities.extend(self._find_entities_of_type(text, entity_type))
        except Exception as e:
            logger.error(f"Error detecting entities: {e}")
            raise
        return entities

    def _find_entities_of_type(
        self, text: str, entity_type: RegexEntityType
    ) -> List[Span]:
        """
        Find entities of a specific type in the text.

        Matches that fail validation are NOT discarded: they are still
        returned, with their confidence halved.

        Args:
            text: Text to analyze
            entity_type: Entity type to search for

        Returns:
            List[Span]: Entities found. If an error occurs part-way through,
            it is logged and the entities collected so far are returned.
        """
        entities: List[Span] = []
        try:
            # Get pattern metadata for this entity type
            pattern_meta = get_pattern_meta(entity_type)
            # The base confidence is the same for every match of this type,
            # so convert it once instead of once per match.
            base_confidence = float(pattern_meta["confidence"])  # type: ignore[index]

            for match in pattern_meta["compiled"].finditer(text):
                matched_text = match.group()

                # With validation disabled, every regex match counts as valid.
                is_valid = True
                if self.enable_validation:
                    is_valid = self._validate_entity(entity_type, matched_text)

                # Create detected entity (aligned with `Span` fields)
                entities.append(
                    Span(
                        start=match.start(),
                        end=match.end(),
                        entity_type=entity_type,
                        replacement=matched_text,
                        # Failed validation halves the confidence.
                        confidence=(
                            base_confidence if is_valid else base_confidence * 0.5
                        ),
                    )
                )
        except Exception as e:
            # Best effort: a failure for one entity type is logged and must
            # not prevent the caller from searching the remaining types.
            logger.error(f"Error finding entities of type {entity_type}: {e}")
        return entities

    def _validate_entity(self, entity_type: RegexEntityType, text: str) -> bool:
        """
        Validate an entity using checksums when possible.

        Args:
            entity_type: Entity type
            text: Entity text

        Returns:
            bool: Result of ``validate_identifier`` for types listed by
            ``get_supported_validator_types()``; True for every other type,
            and True if validation itself raises.
        """
        try:
            # Only validate types that have checksums
            if entity_type in get_supported_validator_types():
                return validate_identifier(entity_type, text)
            # For other types, assume valid if it matches the pattern
            return True
        except Exception as e:
            logger.warning(f"Error validating entity {entity_type}: {e}")
            return True  # If there's an error, assume valid

    def get_pattern_info(self, entity_type: str) -> dict:
        """
        Get pattern information for an entity type.

        Args:
            entity_type: Name of a ``RegexEntityType`` member (resolved with
                ``RegexEntityType[entity_type]``).

        Returns:
            dict: Mapping with the keys ``pattern``, ``confidence``,
            ``description`` and ``has_validation``; an empty dict if the
            lookup raises ``KeyError`` or ``ValueError`` (e.g. unknown name).
        """
        try:
            entity_enum = RegexEntityType[entity_type]
            pattern_meta = get_pattern_meta(entity_enum)
            return {
                "pattern": pattern_meta["pattern"],
                "confidence": pattern_meta["confidence"],
                "description": pattern_meta["description"],
                # Test membership with the enum member, exactly as
                # _validate_entity does, rather than with the raw name string.
                "has_validation": entity_enum in get_supported_validator_types(),
            }
        except (ValueError, KeyError) as e:
            logger.error(f"Error getting pattern info for {entity_type}: {e}")
            return {}

    def test_pattern(self, entity_type: str, test_text: str) -> bool:
        """
        Test whether a text matches a specific pattern.

        Args:
            entity_type: Name of a ``RegexEntityType`` member to test against
            test_text: Text to test

        Returns:
            bool: Result of ``patterns.validate_pattern`` for the resolved
            entity type; False if the lookup raises ``KeyError`` or
            ``ValueError`` (e.g. unknown name).
        """
        try:
            entity_enum = RegexEntityType[entity_type]
            return validate_pattern_exact(entity_enum, test_text)
        except (ValueError, KeyError) as e:
            logger.error(f"Error testing pattern for {entity_type}: {e}")
            return False