# Source code for veil.entity_detectors.spacy.spacy_entity_detector

from __future__ import annotations

from typing import List, Optional

from veil.config.entity_detectors import SpacyEntityDetectorConfig
from veil.core.base_entity_detector import BaseEntityDetector
from veil.core.document import Document
from veil.core.span import Span
from veil.logger import init_logger

from .spacy_entity_type import SpacyEntityType

logger = init_logger(__name__)


class SpacyEntityDetector(BaseEntityDetector[SpacyEntityType]):
    """Adapter over spaCy NER models.

    Runs a spaCy pipeline over a document's text and converts every
    recognised entity whose label maps onto a ``SpacyEntityType`` member
    into a ``Span``. Entities with unmapped labels are dropped.
    """

    ENTITY_TYPES = set(SpacyEntityType)

    def __init__(self, config: SpacyEntityDetectorConfig):
        """Select the compute device and load the configured spaCy pipeline.

        Args:
            config: Detector configuration. ``config.cuda_device`` selects the
                GPU to use (any negative value means CPU) and ``config.model``
                is the pipeline name handed to ``spacy.load``.

        Raises:
            RuntimeError: If the ``spacy`` package cannot be imported.
        """
        super().__init__(config)

        # Imported here rather than at module level so that spaCy is only
        # required when this detector is actually instantiated.
        try:
            import spacy
        except ImportError as e:
            raise RuntimeError(
                "Spacy library not available. Please install it using: pip install spacy"
            ) from e

        # Configure device: try the requested GPU first, then fall back to CPU.
        use_gpu = config.cuda_device >= 0
        if use_gpu:
            try:
                spacy.require_gpu(config.cuda_device)
            except Exception as gpu_err:
                logger.warning(
                    f"Failed to enable spaCy GPU on device {config.cuda_device}: {gpu_err}. Falling back to CPU."
                )
                use_gpu = False
            else:
                logger.info(f"spaCy set to use GPU device {config.cuda_device}")

        if not use_gpu:
            # Best-effort: a failure here must not prevent the detector from
            # loading (presumably guards spaCy builds without require_cpu --
            # TODO confirm), but it is recorded instead of silently dropped.
            try:
                spacy.require_cpu()
            except Exception as cpu_err:
                logger.debug(
                    f"spacy.require_cpu() failed ({cpu_err}); continuing with spaCy's default device."
                )

        # Load the spaCy pipeline
        logger.info(f"Loading spaCy model '{config.model}'")
        self._nlp = spacy.load(config.model)

        # Build alias mapping for label conversion
        self._alias_map = SpacyEntityType._build_alias_map_for_subclass()

    def _map_label_to_entity(self, label: str) -> Optional[SpacyEntityType]:
        """Translate a spaCy entity label into a ``SpacyEntityType``.

        The label is upper-cased and looked up in the alias map; the resulting
        canonical name is then resolved as an enum member name.

        Args:
            label: Entity label as reported by spaCy (``ent.label_``).

        Returns:
            The matching ``SpacyEntityType`` member, or ``None`` when the label
            has no alias entry or the canonical name is not a member.
        """
        canonical_name = self._alias_map.get(label.upper())
        if not canonical_name:
            return None
        try:
            return SpacyEntityType[canonical_name]
        except KeyError:
            return None

    def detect_entities(self, doc: Document) -> List[Span]:
        """Detect entities in ``doc.text`` with the loaded spaCy pipeline.

        Args:
            doc: Document whose ``text`` attribute is analysed.

        Returns:
            One ``Span`` per spaCy entity with a mapped label, in the order
            spaCy reports them. The list is empty when the document text is
            empty or ``None``.
        """
        text = doc.text
        if not text:
            return []

        spans: List[Span] = []
        for ent in self._nlp(text).ents:
            entity_type = self._map_label_to_entity(ent.label_)
            if entity_type is None:
                # Label is not part of SpacyEntityType: skip the entity.
                continue
            spans.append(
                Span(
                    start=ent.start_char,
                    end=ent.end_char,
                    entity_type=entity_type,
                    # The replacement is initialised to the matched text itself.
                    replacement=ent.text,
                    # No score is read from spaCy for the entity.
                    confidence=None,
                )
            )
        return spans