API
Anonymizer
Text anonymization module
Anonymizer
Anonymization class based on strategy formatting
Source code in incognito_anonymizer/anonymizer.py
class Anonymizer:
    """Anonymize text by combining analyzer strategies with a mask strategy.

    Analyzers report the spans to hide; the selected mask decides what is
    written in their place.
    """

    # Analyzer strategies selectable by name through ``add_analyzer``.
    ANALYZERS = {
        "regex": analyzer.RegexStrategy(),
        "pii": analyzer.PiiStrategy(),
    }

    # Mask strategies selectable by name through ``set_mask``.
    MASKS = {
        "placeholder": mask.PlaceholderStrategy(),
        "fake": mask.FakeStrategy(),
        "hash": mask.HashStrategy(),
        "hide": mask.HideStrategy(),
    }

    def __init__(self):
        self._infos = None
        self._position = []
        self._mask = mask.PlaceholderStrategy()
        self._analyzers = set()

    def open_text_file(self, path: str) -> str:
        """
        Open input txt file

        :param path: path of the input txt file
        :returns: file content
        :raises FileNotFoundError: if given file not found
        """
        try:
            with open(path, "r") as f:
                content = f.read()
            return content
        except FileNotFoundError as e:
            # The error is echoed on stdout before being propagated.
            print(e)
            raise

    def open_json_file(self, path: str) -> object:
        """
        Open input json file for personal infos

        :param path: path of the json file
        :returns: the decoded JSON content
        :raises FileNotFoundError: if given file not found
        """
        try:
            # JSON is decoded as UTF-8 instead of the platform default encoding.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return data
        except FileNotFoundError as e:
            # The error is echoed on stdout before being propagated.
            print(e)
            raise

    def set_info(self, infos: PersonalInfo):
        """
        Set personal info

        :param infos: PersonalInfo
        """
        self._infos = infos

    def set_info_from_dict(self, **kwargs):
        """
        Build a PersonalInfo from keyword arguments and store it

        ``None`` values become empty strings and ``datetime`` values are
        rendered as ``YYYY-MM-DD`` before ``PersonalInfo`` is built.

        :param kwargs: the personal info values, passed to ``PersonalInfo``
        :returns: this anonymizer
        """
        clean_data = {}
        for k, v in kwargs.items():
            if v is None:
                clean_data[k] = ""
            elif isinstance(v, datetime):
                clean_data[k] = v.strftime("%Y-%m-%d")
            else:
                clean_data[k] = v
        info_obj = PersonalInfo(**clean_data)
        self.set_info(info_obj)
        # Every attribute of the info object is also copied onto the anonymizer.
        for key, value in vars(info_obj).items():
            setattr(self, key, value)
        return self

    def add_analyzer(self, name: str):
        """
        Add analyzer

        :param name: key of the analyzer in ``ANALYZERS``
        :raises Exception: if no analyzer is registered under ``name``
        """
        if name in self.ANALYZERS:
            analyzer = self.ANALYZERS.get(name)
            self._analyzers.add(analyzer)
        else:
            raise Exception(f"{name} analyzer doesn't exists")

    def set_mask(self, name: str):
        """
        Set mask

        :param name: key of the wanted mask in ``MASKS``
        :raises Exception: if no mask is registered under ``name``
        """
        if name in self.MASKS:
            self._mask = self.MASKS.get(name)
        else:
            raise Exception(f"{name} doesn't exists")

    def anonymize(self, text: str) -> str:
        """
        Global function to anonymize a text based on the chosen strategies

        Each registered analyzer is run on the output of the previous one and
        its spans are masked straight away. Without any analyzer the text is
        returned unchanged.

        :param text: text to anonymize; an empty value is replaced by "NaN"
        :returns: anonymized text
        """
        if not text:
            text = "NaN"
        for strategy in self._analyzers:
            strategy.info = self._infos
            # An analyzer may return None when it has nothing to report;
            # treat that as "no span" instead of failing.
            spans = dict(strategy.analyze(text=text) or {})
            text = self._mask.mask(text, spans)
        return text
add_analyzer(name)
Add an analyzer
:param name: name of the analyzer used for anonymization
Source code in incognito_anonymizer/anonymizer.py
def add_analyzer(self, name: str):
    """
    Register one of the available analyzers

    :param name: key of the analyzer in ``ANALYZERS``
    :raises Exception: if no analyzer is registered under ``name``
    """
    # Guard clause: reject unknown names before touching the active set.
    if name not in self.ANALYZERS:
        raise Exception(f"{name} analyzer doesn't exists")
    self._analyzers.add(self.ANALYZERS.get(name))
anonymize(text)
Global function to anonymize a text based on the chosen strategies
:param text: text to anonymize :returns: anonymized text
Source code in incognito_anonymizer/anonymizer.py
def anonymize(self, text: str) -> str:
    """
    Global function to anonymize a text based on the chosen strategies

    Each registered analyzer is run on the output of the previous one and
    its spans are masked straight away. Without any analyzer the text is
    returned unchanged.

    :param text: text to anonymize; an empty value is replaced by "NaN"
    :returns: anonymized text
    """
    if not text:
        text = "NaN"
    for strategy in self._analyzers:
        strategy.info = self._infos
        # An analyzer may return None when it has nothing to report;
        # treat that as "no span" instead of failing.
        spans = dict(strategy.analyze(text=text) or {})
        text = self._mask.mask(text, spans)
    return text
open_json_file(path)
Open input json file for personal infos
:param path: path of the json file :returns: file content :raises FileNotFoundError: if given file not found
Source code in incognito_anonymizer/anonymizer.py
def open_json_file(self, path: str) -> object:
    """
    Open input json file for personal infos

    :param path: path of the json file
    :returns: the decoded JSON content
    :raises FileNotFoundError: if given file not found
    """
    try:
        # JSON is decoded as UTF-8 instead of the platform default encoding.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data
    except FileNotFoundError as e:
        # The error is echoed on stdout before being propagated.
        print(e)
        raise
open_text_file(path)
Open input txt file
:param path: path of the input txt file :returns: file content :raises FileNotFoundError: if given file not found
Source code in incognito_anonymizer/anonymizer.py
def open_text_file(self, path: str) -> str:
    """
    Read a whole text file

    :param path: path of the input txt file
    :returns: file content
    :raises FileNotFoundError: if given file not found
    """
    try:
        handle = open(path, "r")
    except FileNotFoundError as err:
        # The error is echoed on stdout before being propagated.
        print(err)
        raise
    with handle:
        return handle.read()
set_info(infos)
Set personal info
:param infos: PersonalInfo
Source code in incognito_anonymizer/anonymizer.py
def set_info(self, infos: PersonalInfo):
    """
    Store the personal information handed to the analyzers

    :param infos: PersonalInfo instance kept on the anonymizer
    """
    self._infos = infos
set_info_from_dict(**kwargs)
Set dict to PersonalInfo Class
:param kwargs: keyword arguments holding all the personal info values
Source code in incognito_anonymizer/anonymizer.py
def set_info_from_dict(self, **kwargs):
    """
    Build a PersonalInfo from keyword arguments and store it

    ``None`` values become empty strings and ``datetime`` values are
    rendered as ``YYYY-MM-DD`` before ``PersonalInfo`` is built.

    :param kwargs: the personal info values, passed to ``PersonalInfo``
    :returns: this anonymizer
    """

    def _normalise(value):
        # One value at a time: None -> "", datetime -> ISO date, rest untouched.
        if value is None:
            return ""
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d")
        return value

    info_obj = PersonalInfo(
        **{field: _normalise(raw) for field, raw in kwargs.items()}
    )
    self.set_info(info_obj)
    # Every attribute of the info object is also copied onto the anonymizer.
    for attr_name, attr_value in vars(info_obj).items():
        setattr(self, attr_name, attr_value)
    return self
set_mask(name)
Set the mask
:param name: name of the wanted mask
Source code in incognito_anonymizer/anonymizer.py
def set_mask(self, name: str):
    """
    Select the mask applied to the detected spans

    :param name: key of the wanted mask in ``MASKS``
    :raises Exception: if no mask is registered under ``name``
    """
    # Guard clause: reject unknown names before replacing the current mask.
    if name not in self.MASKS:
        raise Exception(f"{name} doesn't exists")
    self._mask = self.MASKS.get(name)
Analyzer
AnalyzerStrategy
Base class of the analyzer strategies
Source code in incognito_anonymizer/analyzer.py
class AnalyzerStrategy:
    """Base class of the analyzer strategies."""

    def analyze(self, text):
        """
        Analyze a text; concrete strategies must override this method

        :param text: text to analyze
        :raises NotImplementedError: always, on the base class
        """
        raise NotImplementedError()
PiiStrategy
Bases: AnalyzerStrategy
Detect personal infos
Source code in incognito_anonymizer/analyzer.py
class PiiStrategy(AnalyzerStrategy):
    """Detect personal infos"""

    def __init__(self):
        self.info: PersonalInfo = None

    def hide_by_keywords(
        self, text: str, keywords: Iterable[Tuple[str, str]]
    ) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        Locate keywords in a text and return positions with replacements.

        Matching is case-insensitive.

        :param text: text to anonymize
        :param keywords: Iterable of tuples (word, replacement).
        :returns: dict mapping a one-element tuple ``((start, end),)`` to
            the replacement string of the keyword found at that position.
        """
        processor = KeywordProcessor(case_sensitive=False)
        for word, replacement in keywords:
            if not word:
                continue
            # NOTE(review): flashtext appears to require str keywords; coerce
            # so a numeric field (e.g. an integer postal code) does not make
            # the whole lookup fail - confirm the PersonalInfo field types.
            processor.add_keyword(str(word), replacement)
        # Extract keywords with positions
        found_keywords = processor.extract_keywords(text, span_info=True)
        return {
            ((start, end),): replacement
            for replacement, start, end in found_keywords
        }

    def analyze(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        Locate the known personal infos in a text

        :param text: text to anonymize
        :returns: dict of positions and placeholders as produced by
            ``hide_by_keywords``; empty when ``info`` is not a PersonalInfo
            or when the lookup fails.
        """
        # Layouts under which the birthdate is searched for in the text.
        date_formats = (
            "%m/%d/%Y",
            "%m %d %Y",
            "%m:%d:%Y",
            "%m-%d-%Y",
            "%Y-%m-%d",
            "%d/%m/%Y",
        )
        # The personal data is deliberately not printed here.
        try:
            info = self.info
            if not isinstance(info, PersonalInfo):
                return {}
            keywords = [
                (info.first_name, "<NAME>"),
                (info.last_name, "<NAME>"),
                (info.birth_name, "<NAME>"),
                (info.ipp, "<IPP>"),
                (info.postal_code, "<CODE_POSTAL>"),
            ]
            birthdate = info.birthdate
            if hasattr(birthdate, "strftime"):
                keywords.extend(
                    (birthdate.strftime(fmt), "<DATE>") for fmt in date_formats
                )
            elif birthdate:
                # A birthdate that is not a date object is searched verbatim,
                # so it no longer disables the masking of the other fields.
                keywords.append((birthdate, "<DATE>"))
            keywords.append((info.adress, "<ADRESSE>"))
            return self.hide_by_keywords(
                text, [(word, tag) for word, tag in keywords if word]
            )
        except Exception as e:
            print(f"Error : {e}. Given infos not in text")
            # Always hand back a mapping so callers can merge the result.
            return {}
analyze(text)
Hide specific words based on keywords
:param text: text to anonymize
Source code in incognito_anonymizer/analyzer.py
def analyze(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
    """
    Locate the known personal infos in a text

    :param text: text to anonymize
    :returns: dict of positions and placeholders as produced by
        ``hide_by_keywords``; empty when ``info`` is not a PersonalInfo
        or when the lookup fails.
    """
    # Layouts under which the birthdate is searched for in the text.
    date_formats = (
        "%m/%d/%Y",
        "%m %d %Y",
        "%m:%d:%Y",
        "%m-%d-%Y",
        "%Y-%m-%d",
        "%d/%m/%Y",
    )
    # The personal data is deliberately not printed here.
    try:
        info = self.info
        if not isinstance(info, PersonalInfo):
            return {}
        keywords = [
            (info.first_name, "<NAME>"),
            (info.last_name, "<NAME>"),
            (info.birth_name, "<NAME>"),
            (info.ipp, "<IPP>"),
            (info.postal_code, "<CODE_POSTAL>"),
        ]
        birthdate = info.birthdate
        if hasattr(birthdate, "strftime"):
            keywords.extend(
                (birthdate.strftime(fmt), "<DATE>") for fmt in date_formats
            )
        elif birthdate:
            # A birthdate that is not a date object is searched verbatim,
            # so it no longer disables the masking of the other fields.
            keywords.append((birthdate, "<DATE>"))
        keywords.append((info.adress, "<ADRESSE>"))
        return self.hide_by_keywords(
            text, [(word, tag) for word, tag in keywords if word]
        )
    except Exception as e:
        print(f"Error : {e}. Given infos not in text")
        # Always hand back a mapping so callers can merge the result.
        return {}
hide_by_keywords(text, keywords)
Locate keywords in a text and return their positions with replacements.
:param text: text to anonymize :param keywords: Iterable of tuples (word, replacement).
:returns: Dict where each key is a one-element tuple holding the (start, end) positions of a word, and each value is the replacement string.
Source code in incognito_anonymizer/analyzer.py
def hide_by_keywords(
    self, text: str, keywords: Iterable[Tuple[str, str]]
) -> Dict[Tuple[Tuple[int, int]], str]:
    """
    Locate keywords in a text and return positions with replacements.

    Matching is case-insensitive.

    :param text: text to anonymize
    :param keywords: Iterable of tuples (word, replacement).
    :returns: dict mapping a one-element tuple ``((start, end),)`` to
        the replacement string of the keyword found at that position.
    """
    processor = KeywordProcessor(case_sensitive=False)
    for word, replacement in keywords:
        if not word:
            continue
        # NOTE(review): flashtext appears to require str keywords; coerce
        # so a numeric field (e.g. an integer postal code) does not make
        # the whole lookup fail - confirm the PersonalInfo field types.
        processor.add_keyword(str(word), replacement)
    # Extract keywords with positions
    found_keywords = processor.extract_keywords(text, span_info=True)
    return {
        ((start, end),): replacement
        for replacement, start, end in found_keywords
    }
RegexStrategy
Bases: AnalyzerStrategy
Detect word based on regex
Source code in incognito_anonymizer/analyzer.py
class RegexStrategy(AnalyzerStrategy):
    """Detect words with regular expressions"""

    def __init__(self):
        # Building blocks shared by the name patterns below.
        Xxxxx = r"[A-ZÀ-Ÿ]\p{Ll}+"
        XXxX_ = r"[A-ZÀ-Ÿ][A-ZÀ-Ÿ\p{Ll}-]"
        sep = r"(?:[ ]*|-)?"
        # Titles after which a name is looked for (used in look-behinds).
        self.title_regex = r"([Dd][Rr][.]?|[Dd]octeur|[mM]r?[.]?|[Ii]nterne[ ]*:?|INT|[Ee]xterne[ ]*:?|[Mm]onsieur|[Mm]adame|[Rr].f.rent[ ]*:?|[P]r[.]?|[Pp]rofesseure|[Pp]rofesseur|\s[Mm]me[.]?|[Ee]nfant|[Mm]lle|[Nn]ée?)"
        self.email_pattern = r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"
        # Maps each pattern to the placeholder reported for its matches.
        self.PATTERNS = {
            # rf"(?<={self.title_regex})([\s-][A-Z]+)+([\s-][A-Z][a-z]+)+(?![a-z])": "<NAME>",
            rf"(?<={self.title_regex}[ ]+)(?P<LN0>[A-ZÀ-Ÿ][A-ZÀ-Ÿ](?:{sep}(?:ep[.]|de|[A-ZÀ-Ÿ]+))*)[ ]+(?P<FN0>{Xxxxx}(?:{sep}{Xxxxx})*)": "<NAME>",
            rf"(?<={self.title_regex}[ ]+)(?P<FN1>{Xxxxx}(?:{sep}{Xxxxx})*)[ ]+(?P<LN1>[A-ZÀ-Ÿ][A-ZÀ-Ÿ]+(?:{sep}(?:ep[.]|de|[A-ZÀ-Ÿ]+))*)": "<NAME>",
            rf"(?<={self.title_regex}[ ]+)(?P<LN3>{Xxxxx}(?:(?:-|[ ]de[ ]|[ ]ep[.][ ]){Xxxxx})*)[ ]+(?P<FN2>{Xxxxx}(?:-{Xxxxx})*)": "<NAME>",
            rf"(?<={self.title_regex}[ ]+)(?P<LN2>{XXxX_}+(?:{sep}{XXxX_}+)*)": "<NAME>",
            rf"(?<={self.title_regex}[ ]+)(?P<FN0>[A-ZÀ-Ÿ][.])\s+(?P<LN0>{XXxX_}+(?:{sep}{XXxX_}+)*)": "<NAME>",
            r"[12]\s*[0-9]{2}\s*(0[1-9]|1[0-2])\s*(2[AB]|[0-9]{2})\s*[0-9]{3}\s*[0-9]{3}\s*(?:\(?([0-9]{2})\)?)?": "<NIR>",
            r"(?:(?:\+|00)33|0)[ \t]*[1-9](?:[ \t.-]*\d{2}){4}": "<PHONE>",
            self.email_pattern: "<EMAIL>"
        }

    def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        Find word positions based on the regex patterns

        The result is also stored on ``self.position``.

        :param text: text to anonymise
        :returns: dict mapping a tuple of (start, end) spans to the
            placeholder of the pattern that matched them. Entries sharing
            at least one identical span are merged under the placeholder
            of the latest pattern.
        """
        self.position = {}
        for pattern, repl in self.PATTERNS.items():
            spans = [
                found.span()
                for found in regex.finditer(pattern, text, overlapped=True)
            ]
            if not spans:
                continue
            # Entries already recorded that share an identical span.
            shared_keys = [
                key for key in list(self.position)
                if not set(key).isdisjoint(spans)
            ]
            if not shared_keys:
                self.position[tuple(spans)] = repl
                continue
            merged = set(spans)
            for key in shared_keys:
                merged.update(key)
                del self.position[key]
            self.position[tuple(sorted(merged))] = repl
        return self.position

    def analyze(self, text: str):
        """
        Locate the spans to hide using regular expressions

        :param text: text to anonymize
        :returns: the result of ``multi_subs_by_regex``
        """
        return self.multi_subs_by_regex(text)
analyze(text)
Hide text using regular expression :param text: text to anonymize
Source code in incognito_anonymizer/analyzer.py
def analyze(self, text: str):
    """
    Locate the spans to hide using regular expressions

    :param text: text to anonymize
    :returns: the result of ``multi_subs_by_regex``
    """
    found_positions = self.multi_subs_by_regex(text)
    return found_positions
multi_subs_by_regex(text)
Find word positions based on regex
:param text: text to anonymise :returns: Dict where each key is a tuple of (start, end) positions of the matched words, and each value is the replacement string.
Source code in incognito_anonymizer/analyzer.py
def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
    """
    Find word positions based on the regex patterns

    The result is also stored on ``self.position``.

    :param text: text to anonymise
    :returns: dict mapping a tuple of (start, end) spans to the
        placeholder of the pattern that matched them. Entries sharing
        at least one identical span are merged under the placeholder
        of the latest pattern.
    """
    self.position = {}
    for pattern, repl in self.PATTERNS.items():
        spans = [
            found.span()
            for found in regex.finditer(pattern, text, overlapped=True)
        ]
        if not spans:
            continue
        # Entries already recorded that share an identical span.
        shared_keys = [
            key for key in list(self.position)
            if not set(key).isdisjoint(spans)
        ]
        if not shared_keys:
            self.position[tuple(spans)] = repl
            continue
        merged = set(spans)
        for key in shared_keys:
            merged.update(key)
            del self.position[key]
        self.position[tuple(sorted(merged))] = repl
    return self.position
Mask
FakeStrategy
Bases: Strategy
Replace word by natural placeholder
Source code in incognito_anonymizer/mask.py
class FakeStrategy(Strategy):
    """Replace word by natural placeholder"""

    def __init__(self):
        # Natural-looking value substituted for each placeholder tag.
        # (The attribute name keeps its historical spelling.)
        self.natural_placehodler = {
            "<PER>": "Margaret Hamilton",
            "<NAME>": "Margaret Hamilton",
            "<CODE_POSTAL>": "42000",
            "<DATE>": "1970/01/01",
            "<IPP>": "IPPPH:0987654321",
            "<NIR>": "012345678987654",
            "<EMAIL>": "place.holder@anonymization.cdc",
            "<PHONE>": "0611223344",
            "<ADRESSE>": "35 Rue Margaret Hamilton",
        }

    def mask(self, text: str, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
        """
        Replace in text, words at the given coordinates by a natural placeholder.

        Overlapping spans are merged and replaced once, using the value of
        the span that starts first (the longest one on a tie).

        :param text: text to anonymize
        :param coordinate: position and placeholder of the word to replace
        :returns: anonymized text
        :raises KeyError: if a placeholder has no natural replacement

        Example :
        >>> anonymizer = FakeStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        'Margaret Hamilton'
        """
        ordered = sorted(
            (
                (start, end, self.natural_placehodler[tag])
                for spans, tag in coordinate.items()
                for start, end in spans
            ),
            key=lambda item: (item[0], -item[1]),
        )
        merged = []
        for start, end, repl in ordered:
            if merged and start < merged[-1][1]:
                # Overlaps the previous span: widen it instead of replacing twice.
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end, repl])
        text_as_list = list(text)
        # Right to left, so earlier offsets stay valid after each replacement.
        for start, end, repl in reversed(merged):
            text_as_list[start:end] = repl
        return "".join(text_as_list)
mask(text, coordinate)
Replace in text, words at the given coordinates by a natural placeholder.
:param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text
Example :
anonymizer = FakeStrategy() text = "Bob" coordinate = {((0,3),): '<NAME>',}
anonymizer.mask(text, coordinate) 'Margaret Hamilton'
Source code in incognito_anonymizer/mask.py
def mask(self, text: str, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
    """
    Replace in text, words at the given coordinates by a natural placeholder.

    Overlapping spans are merged and replaced once, using the value of
    the span that starts first (the longest one on a tie).

    :param text: text to anonymize
    :param coordinate: position and placeholder of the word to replace
    :returns: anonymized text
    :raises KeyError: if a placeholder has no natural replacement

    Example :
    >>> anonymizer = FakeStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    'Margaret Hamilton'
    """
    ordered = sorted(
        (
            (start, end, self.natural_placehodler[tag])
            for spans, tag in coordinate.items()
            for start, end in spans
        ),
        key=lambda item: (item[0], -item[1]),
    )
    merged = []
    for start, end, repl in ordered:
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: widen it instead of replacing twice.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end, repl])
    text_as_list = list(text)
    # Right to left, so earlier offsets stay valid after each replacement.
    for start, end, repl in reversed(merged):
        text_as_list[start:end] = repl
    return "".join(text_as_list)
HashStrategy
Bases: Strategy
Replace words with their hash
Source code in incognito_anonymizer/mask.py
class HashStrategy(Strategy):
    """Replace words with their hash (nothing is implemented here yet)."""

    # TODO: blake256 with 8 digits, and the heavy-handed paper approach
    # (about twenty bytes) - translated from the original French note.
HideStrategy
Bases: Strategy
Replace by *
Source code in incognito_anonymizer/mask.py
class HideStrategy(Strategy):
    """Replace by *"""

    def mask(self, text, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
        """
        Replace in text, words at the given coordinates by *.

        A span shorter than 5 characters is replaced by 8 stars, a longer
        one by as many stars as it has characters. Overlapping spans are
        merged and hidden once.

        :param text: text to anonymize
        :param coordinate: position and placeholder of the word to replace
        :returns: anonymized text

        Example :
        >>> anonymizer = HideStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        '********'
        """
        ordered = sorted(
            (
                (start, end)
                for spans in coordinate
                for start, end in spans
            ),
            key=lambda item: (item[0], -item[1]),
        )
        merged = []
        for start, end in ordered:
            if merged and start < merged[-1][1]:
                # Overlaps the previous span: widen it instead of hiding twice.
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        text_as_list = list(text)
        # Right to left, so earlier offsets stay valid after each replacement.
        for start, end in reversed(merged):
            word_len = end - start
            text_as_list[start:end] = "*" * (8 if word_len < 5 else word_len)
        return "".join(text_as_list)
mask(text, coordinate)
Replace in text, words at the given coordinates by *. :param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text
Example :
anonymizer = HideStrategy() text = "Bob" coordinate = {((0,3),): '<NAME>',}
anonymizer.mask(text, coordinate) '********'
Source code in incognito_anonymizer/mask.py
def mask(self, text, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
    """
    Replace in text, words at the given coordinates by *.

    A span shorter than 5 characters is replaced by 8 stars, a longer
    one by as many stars as it has characters. Overlapping spans are
    merged and hidden once.

    :param text: text to anonymize
    :param coordinate: position and placeholder of the word to replace
    :returns: anonymized text

    Example :
    >>> anonymizer = HideStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    '********'
    """
    ordered = sorted(
        (
            (start, end)
            for spans in coordinate
            for start, end in spans
        ),
        key=lambda item: (item[0], -item[1]),
    )
    merged = []
    for start, end in ordered:
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: widen it instead of hiding twice.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    text_as_list = list(text)
    # Right to left, so earlier offsets stay valid after each replacement.
    for start, end in reversed(merged):
        word_len = end - start
        text_as_list[start:end] = "*" * (8 if word_len < 5 else word_len)
    return "".join(text_as_list)
PlaceholderStrategy
Bases: Strategy
Replace by placeholders
Source code in incognito_anonymizer/mask.py
class PlaceholderStrategy(Strategy):
    """Replace by placeholders"""

    def mask(self, text, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
        """
        Replace in text, words at the given coordinates by a placeholder.

        Overlapping spans are merged and replaced once, using the
        placeholder of the span that starts first (the longest one on a tie).

        :param text: text to anonymize
        :param coordinate: position and placeholder of the word to replace
        :returns: anonymized text

        Example :
        >>> anonymizer = PlaceholderStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        '<NAME>'
        """
        ordered = sorted(
            (
                (start, end, tag)
                for spans, tag in coordinate.items()
                for start, end in spans
            ),
            key=lambda item: (item[0], -item[1]),
        )
        merged = []
        for start, end, repl in ordered:
            if merged and start < merged[-1][1]:
                # Overlaps the previous span: widen it instead of replacing twice.
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end, repl])
        text_as_list = list(text)
        # Right to left, so earlier offsets stay valid after each replacement.
        for start, end, repl in reversed(merged):
            text_as_list[start:end] = repl
        return "".join(text_as_list)
mask(text, coordinate)
Replace in text, words at the given coordinates by a placeholder. :param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text
Example :
anonymizer = PlaceholderStrategy() text = "Bob" coordinate = {((0,3),): '<NAME>',}
anonymizer.mask(text, coordinate) '<NAME>'
Source code in incognito_anonymizer/mask.py
def mask(self, text, coordinate: Dict[Tuple[Tuple[int, int], ...], str]) -> str:
    """
    Replace in text, words at the given coordinates by a placeholder.

    Overlapping spans are merged and replaced once, using the
    placeholder of the span that starts first (the longest one on a tie).

    :param text: text to anonymize
    :param coordinate: position and placeholder of the word to replace
    :returns: anonymized text

    Example :
    >>> anonymizer = PlaceholderStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    '<NAME>'
    """
    ordered = sorted(
        (
            (start, end, tag)
            for spans, tag in coordinate.items()
            for start, end in spans
        ),
        key=lambda item: (item[0], -item[1]),
    )
    merged = []
    for start, end, repl in ordered:
        if merged and start < merged[-1][1]:
            # Overlaps the previous span: widen it instead of replacing twice.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end, repl])
    text_as_list = list(text)
    # Right to left, so earlier offsets stay valid after each replacement.
    for start, end, repl in reversed(merged):
        text_as_list[start:end] = repl
    return "".join(text_as_list)