nerpa / anonymise.py

akhatre

add reference to gliner2 zero shot capabilities

29ae185 4 months ago

9.43 kB

	"""
	NERPA – Text anonymisation using the fine-tuned GLiNER2 model.

	Usage:
	python anonymise.py "My name is John Smith, born 15/03/1990. Email: john@example.com"
	python anonymise.py --file input.txt
	python anonymise.py --file input.txt --output anonymised.txt
	"""

	import argparse
	import logging
	import sys
	import warnings
	from typing import Optional

	# Silence the tokenizer's "incorrect regex pattern ... fix_mistral_regex"
	# warning. The filter's ``message`` regex is matched against the *start* of
	# the warning text, so it needs a leading ``.*`` (and ``.*`` between the two
	# phrases) to match text that surrounds them.
	warnings.filterwarnings("ignore", message=r".*incorrect regex pattern.*fix_mistral_regex.*")

	import torch
	from gliner2 import GLiNER2

	logger = logging.getLogger(__name__)

	# Entity types the model was fine-tuned to recognise, with descriptions
	# that guide the bi-encoder towards better detection.
	# Maps label -> natural-language description; the label is also the text
	# used inside the ``[LABEL]`` placeholder written by ``anonymise``.
	PII_ENTITIES: dict[str, str] = {
	"LOCATION": "Address, country, city, postcode, street, any other location",
	"AGE": "Age of a person",
	"DIGITAL_KEYS": "Digital keys, passwords, pins used to access anything like servers, banks, APIs, accounts etc",
	"BANK_ACCOUNT_DETAILS": "Bank account details such as number, IBAN, SWIFT, routing numbers etc",
	"CARD_DETAILS": "Debit or credit card details such as card number, CVV, expiration etc",
	"DATE_TIME": "Generic date and time",
	"DATE_OF_BIRTH": "Date of birth",
	"PERSONAL_ID_NUMBERS": "Common personal identification numbers such as passport numbers, driving licenses, taxpayer and insurance numbers",
	"TECHNICAL_ID_NUMBERS": "IP and MAC addresses, serial numbers and any other technical ID numbers",
	"EMAIL": "Email",
	"PERSON_NAME": "Person name",
	"BUSINESS_NAME": "Business name",
	"PHONE": "Any personal or other phone numbers",
	"URL": "Any short or full URL",
	"USERNAME": "Username",
	"VEHICLE_ID_NUMBERS": "Any vehicle numbers like license plates, vehicle identification numbers",
	}

	# Default minimum confidence passed to the model as ``threshold``; also the
	# default of the ``--threshold`` command-line option.
	CONFIDENCE_THRESHOLD = 0.25
	# Default length, in characters, of each slice produced by ``chunk_text``.
	CHUNK_SIZE = 3000
	# Default number of characters shared by two consecutive chunks.
	CHUNK_OVERLAP = 100
	# Number of chunks handed to the model per ``batch_extract_entities`` call.
	BATCH_SIZE = 32


	def load_model(model_path: str = ".") -> GLiNER2:
	    """Load the NERPA model and move it to the preferred device.

	    CUDA is preferred, then MPS, then CPU. If moving the model to the
	    chosen device raises ``RuntimeError``, a warning is logged and the
	    model is moved to the CPU instead.

	    Args:
	        model_path: Location handed to ``GLiNER2.from_pretrained``
	            (defaults to the current directory).

	    Returns:
	        The loaded model.
	    """
	    if torch.cuda.is_available():
	        backend = "cuda"
	    else:
	        backend = "mps" if torch.backends.mps.is_available() else "cpu"
	    target = torch.device(backend)

	    nerpa = GLiNER2.from_pretrained(model_path)
	    try:
	        nerpa.to(target)
	    except RuntimeError:
	        logger.warning(
	            "Failed to load model on %s, falling back to CPU.", target
	        )
	        nerpa.to(torch.device("cpu"))
	    return nerpa


	def chunk_text(
	    text: str,
	    chunk_size: int = CHUNK_SIZE,
	    overlap: int = CHUNK_OVERLAP,
	) -> tuple[list[str], list[int]]:
	    """Split text into overlapping chunks, returning chunks and their start offsets.

	    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
	    each chunk repeats the last ``overlap`` characters of the previous one.

	    Args:
	        text: Text to split. An empty string yields two empty lists.
	        chunk_size: Maximum length of each chunk, in characters.
	        overlap: Number of characters shared by consecutive chunks.

	    Returns:
	        ``(chunks, starts)`` where ``starts[i]`` is the offset of
	        ``chunks[i]`` within ``text``.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
	            negative or not smaller than ``chunk_size``.
	    """
	    if not text:
	        return [], []
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    # A step of zero makes range() raise, a negative step silently yields no
	    # chunks at all, and a negative overlap leaves unscanned gaps between
	    # chunks - reject all of these up front.
	    if not 0 <= overlap < chunk_size:
	        raise ValueError(
	            f"overlap must satisfy 0 <= overlap < chunk_size, "
	            f"got overlap={overlap}, chunk_size={chunk_size}"
	        )

	    step = chunk_size - overlap
	    starts = list(range(0, len(text), step))
	    chunks = [text[pos : pos + chunk_size] for pos in starts]
	    return chunks, starts


	def detect_entities(
	    model: GLiNER2,
	    text: str,
	    entities: Optional[dict[str, str]] = None,
	    threshold: float = CONFIDENCE_THRESHOLD,
	) -> list[dict]:
	    """
	    Find PII entities in ``text``.

	    The text is split with ``chunk_text``, the chunks are sent to the model
	    in groups of ``BATCH_SIZE``, and the per-chunk spans are shifted by the
	    chunk's start offset so they index into the original text.

	    Args:
	        model: Model exposing ``batch_extract_entities``.
	        text: Text to scan.
	        entities: Label -> description mapping to detect; ``PII_ENTITIES``
	            is used when this is ``None`` or empty.
	        threshold: Confidence threshold forwarded to the model.

	    Returns:
	        A list of ``{"type": str, "start": int, "end": int, "score": float}``
	        dicts sorted by position, with overlapping spans fused into one.
	    """
	    requested = entities or PII_ENTITIES

	    # Both date labels are always queried together so the model can tell
	    # them apart; a label that was not requested is filtered out later.
	    labels = dict(requested)
	    wants_generic = "DATE_TIME" in labels
	    wants_birth = "DATE_OF_BIRTH" in labels
	    if wants_generic != wants_birth:
	        missing = "DATE_OF_BIRTH" if wants_generic else "DATE_TIME"
	        labels[missing] = PII_ENTITIES[missing]

	    pieces, piece_starts = chunk_text(text)

	    per_chunk: list[dict] = []
	    cursor = 0
	    while cursor < len(pieces):
	        per_chunk.extend(
	            model.batch_extract_entities(
	                pieces[cursor : cursor + BATCH_SIZE],
	                labels,
	                include_confidence=True,
	                include_spans=True,
	            threshold=threshold,
	            )
	        )
	        cursor += BATCH_SIZE

	    # For every exact (start, end) span keep only the best-scoring label;
	    # this also removes duplicates found in the overlap between chunks.
	    best_by_span: dict[tuple[int, int], dict] = {}
	    for base, extraction in zip(piece_starts, per_chunk):
	        for label, hits in extraction["entities"].items():
	            for hit in hits:
	                span = (hit["start"] + base, hit["end"] + base)
	                confidence = hit["confidence"]
	                known = best_by_span.get(span)
	                if known is None or known["score"] < confidence:
	                    best_by_span[span] = {"type": label, "score": confidence}

	    candidates = [
	        (begin, finish, info)
	        for (begin, finish), info in best_by_span.items()
	        if info["type"] in requested
	    ]
	    if not candidates:
	        return []
	    candidates.sort(key=lambda item: (item[0], item[1]))

	    # Sweep left to right: a span that starts inside the previous one is
	    # fused into it, and the fused span takes the higher-confidence label.
	    merged: list[dict] = []
	    for begin, finish, info in candidates:
	        tail = merged[-1] if merged else None
	        if tail is not None and begin < tail["end"]:
	            tail["end"] = max(tail["end"], finish)
	            if info["score"] > tail["score"]:
	                tail["type"] = info["type"]
	                tail["score"] = info["score"]
	        else:
	            merged.append({
	                "type": info["type"],
	                "start": begin,
	                "end": finish,
	                "score": info["score"],
	            })

	    return merged


	def anonymise(text: str, detected: list[dict]) -> str:
	    """Replace detected entities with placeholders like ``[PERSON_NAME]``.

	    Entities are processed in order of their ``"start"`` offset. An entity
	    that begins inside a region already replaced (nested or overlapping
	    spans) does not get a placeholder of its own; it only extends the
	    replaced region, so no character covered by any entity is ever copied
	    to the output.

	    Args:
	        text: The original text.
	        detected: Dicts with at least ``"type"``, ``"start"`` and ``"end"``
	            keys, where start/end are offsets into ``text``.

	    Returns:
	        ``text`` with every detected span replaced by ``[<type>]``.
	    """
	    parts: list[str] = []
	    prev_end = 0
	    for entity in sorted(detected, key=lambda e: e["start"]):
	        start, end = entity["start"], entity["end"]
	        if parts and start < prev_end:
	            # Overlaps the previous entity: never move the cursor backwards,
	            # otherwise the already-redacted text would be emitted again.
	            prev_end = max(prev_end, end)
	            continue
	        parts.append(text[prev_end:start])
	        parts.append(f'[{entity["type"]}]')
	        prev_end = max(prev_end, end)
	    parts.append(text[prev_end:])
	    return "".join(parts)


	def main() -> None:
	    """Command-line entry point.

	    Reads text from the positional argument or ``--file``, detects entities
	    with the model found at ``--model``, optionally lists them
	    (``--show-entities``), and writes the anonymised text to ``--output``
	    or to stdout. Exits with an error message on invalid arguments or on
	    file read/write failures.
	    """
	    parser = argparse.ArgumentParser(
	        description="Anonymise PII in text using the NERPA model.",
	    )
	    parser.add_argument(
	        "text", nargs="?", help="Text to anonymise (or use --file)",
	    )
	    parser.add_argument(
	        "--file", "-f", help="Read text from a file instead",
	    )
	    parser.add_argument(
	        "--output", "-o",
	        help="Write anonymised text to file (default: stdout)",
	    )
	    parser.add_argument(
	        "--model", "-m", default=".",
	        help="Path to model directory (default: current dir)",
	    )
	    parser.add_argument(
	        "--threshold", "-t", type=float, default=CONFIDENCE_THRESHOLD,
	        help=f"Confidence threshold (default: {CONFIDENCE_THRESHOLD})",
	    )
	    parser.add_argument(
	        "--show-entities", action="store_true",
	        help="Print detected entities before anonymised text",
	    )
	    parser.add_argument(
	        "--extra-entities", "-e", action="append", metavar="LABEL=DESCRIPTION",
	        help=(
	            "Additional custom entity types to detect alongside the built-in "
	            "PII entities. Repeat for each type. Format: LABEL=\"Description\". "
	            "Example: -e PRODUCT=\"Product name\" -e SKILL=\"Professional skill\""
	        ),
	    )
	    args = parser.parse_args()

	    # Without a configured handler, INFO records are dropped, so the
	    # --show-entities listing would never appear. basicConfig logs to
	    # stderr, which keeps stdout reserved for the anonymised text.
	    logging.basicConfig(
	        level=logging.INFO if args.show_entities else logging.WARNING,
	        format="%(message)s",
	    )

	    if args.file:
	        try:
	            with open(args.file, encoding="utf-8") as f:
	                text = f.read()
	        except OSError as exc:
	            sys.exit(f"Error reading {args.file}: {exc}")
	    elif args.text:
	        text = args.text
	    else:
	        parser.error("Provide text as an argument or use --file")

	    extra: dict[str, str] = {}
	    if args.extra_entities:
	        for item in args.extra_entities:
	            if "=" not in item:
	                parser.error(
	                    f"Invalid --extra-entities value '{item}'. "
	                    "Expected format: LABEL=\"Description\""
	                )
	            label, description = item.split("=", 1)
	            label = label.strip()
	            if not label:
	                parser.error(
	                    f"Invalid --extra-entities value '{item}'. "
	                    "The label before '=' must not be empty"
	                )
	            extra[label] = description.strip()

	    model = load_model(args.model)
	    # Merge the two mappings (custom labels override built-in ones of the
	    # same name). ``None`` lets detect_entities use its built-in defaults.
	    all_entities = {**PII_ENTITIES, **extra} if extra else None
	    detected = detect_entities(model, text, entities=all_entities, threshold=args.threshold)

	    if args.show_entities:
	        for entity in detected:
	            span = text[entity["start"] : entity["end"]]
	            logger.info(
	                " %-25s [%5d:%5d] (score=%.2f) %r",
	                entity["type"], entity["start"], entity["end"],
	                entity["score"], span,
	            )

	    result = anonymise(text, detected)

	    if args.output:
	        try:
	            with open(args.output, "w", encoding="utf-8") as f:
	                f.write(result)
	        except OSError as exc:
	            sys.exit(f"Error writing {args.output}: {exc}")
	    else:
	        print(result)


	if __name__ == "__main__":
	main()