API

Anonymizer

Text anonymization module

Anonymizer

Anonymization class based on strategy formatting

Source code in incognito_anonymizer/anonymizer.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class Anonymizer:
    """Anonymization engine driven by pluggable strategies.

    Analyzer strategies detect sensitive spans, a mask strategy rewrites
    them, and an optional annotator can serialize the detections instead.
    """

    def __init__(self):
        # Available analysis strategies, keyed by the name passed to add_analyzer().
        self.ANALYZERS = {
            "regex": analyzer.RegexStrategy(),
            "pii": analyzer.PiiStrategy(),
            "lossy": analyzer.LossyStrategy(),
        }

        # Available masking strategies, keyed by the name passed to set_mask().
        self.MASKS = {
            "placeholder": mask.PlaceholderStrategy(),
            "fake": mask.FakeStrategy(),
            "hash": mask.HashStrategy(),
            "hide": mask.HideStrategy(),
        }

        # Available annotators, keyed by the name passed to set_annotator().
        self.ANNOTATORS = {
            "standoff": anotate.StandoffStrategy(),
            "doccano": anotate.DoccanoStrategy(),
            "uimacas": anotate.UimaCasStrategy(),
        }

        self._infos = None  # PersonalInfo record used by PII-aware analyzers
        self._position = []
        self._mask = mask.PlaceholderStrategy()  # default mask strategy
        self._analyzers = []  # ordered list of active analyzer strategies
        self._annotator = None  # set via set_annotator(); required by annotate()
        self._entities: list[DetectedEntity] = []  # results of the last anonymize()

    def open_text_file(self, path: str) -> str:
        """
        Open an input txt file.

        :param path: path of the input txt file
        :returns: file content
        :raises FileNotFoundError: if the given file is not found
        """
        try:
            with open(path, "r") as f:
                content = f.read()
            return content
        except FileNotFoundError as e:
            print(e)
            raise

    def open_json_file(self, path: str) -> dict:
        """
        Open an input json file containing personal infos.

        :param path: path of the json file
        :returns: parsed JSON content
        :raises FileNotFoundError: if the given file is not found
        """
        try:
            with open(path, "r") as f:
                data = json.load(f)
            return data
        except FileNotFoundError as e:
            print(e)
            raise

    def set_info(self, infos: PersonalInfo):
        """
        Set the personal info record used by PII analyzers.

        :param infos: PersonalInfo instance
        :returns: the same PersonalInfo, for chaining
        """
        self._infos = infos
        return infos

    def set_info_from_dict(self, **kwargs):
        """
        Build a PersonalInfo from keyword values and store it.

        None values become empty strings and datetimes are rendered as
        YYYY-MM-DD before constructing the PersonalInfo.

        :param kwargs: personal info field values
        :returns: the created PersonalInfo
        """
        clean_data = {
            k: (
                ""
                if v is None
                else v.strftime("%Y-%m-%d")
                if isinstance(v, datetime)
                else v
            )
            for k, v in kwargs.items()
        }
        info_obj = PersonalInfo(**clean_data)
        self._infos = info_obj
        return info_obj

    def add_analyzer(self, name: str):
        """
        Register an analyzer strategy by name (no-op when already active).

        :param name: key into self.ANALYZERS
        :raises Exception: if the name is unknown
        """
        if name not in self.ANALYZERS:
            raise Exception(f"{name} analyzer doesn't exist")
        # Local renamed from `analyzer` to stop shadowing the analyzer module
        # used in __init__.
        strategy = self.ANALYZERS[name]
        if strategy not in self._analyzers:
            self._analyzers.append(strategy)

    def set_mask(self, name: str):
        """
        Select the mask strategy by name.

        :param name: key into self.MASKS
        :raises Exception: if the name is unknown
        """
        if name not in self.MASKS:
            raise Exception(f"{name} mask doesn't exist")
        self._mask = self.MASKS[name]

    def set_annotator(self, name: str):
        """
        Select the annotator by name.

        :param name: key into self.ANNOTATORS
        :raises Exception: if the name is unknown
        """
        if name not in self.ANNOTATORS:
            raise Exception(f"{name} annotator doesn't exist")
        self._annotator = self.ANNOTATORS[name]

    def anonymize(self, text: str, infos: PersonalInfo = None) -> str:
        """
        Anonymize *text* with every registered analyzer, masking each match.

        :param text: text to anonymize
        :param infos: optional PersonalInfo overriding the stored one
        :returns: anonymized text, or the literal string "NaN" for empty input
        """
        if not text:
            return "NaN"

        self._entities = []
        resolved_infos = infos if infos is not None else self._infos
        anonymized_text = text

        for strategy in self._analyzers:
            # Each strategy analyzes the output of the previous mask pass, so
            # recorded spans index into the partially masked text.
            spans = strategy.analyze(text=anonymized_text, info=resolved_infos)

            for positions, replacement in spans.items():
                for start, end in positions:
                    self._entities.append(DetectedEntity(
                        original=anonymized_text[start:end],
                        replacement=replacement,
                        type=replacement.strip("<>"),  # e.g. "<NAME>" -> "NAME"
                        start=start,
                        end=end
                    ))

            anonymized_text = self._mask.mask(anonymized_text, spans)

        # Expose entities in document order.
        self._entities.sort(key=lambda e: e.start)
        return anonymized_text

    def get_entities(self) -> list[DetectedEntity]:
        """
        Return the entities detected by the last anonymize() call.

        :returns: list of DetectedEntity objects, sorted by start offset
        """
        return self._entities

    def annotate(self, text: str) -> str:
        """
        Annotate *text* with the spans found by the registered analyzers.

        :param text: text to annotate
        :returns: annotated text
        :raises ValueError: if no annotator has been set
        """
        spans = {}
        for strategy in self._analyzers:
            strategy.info = self._infos
            span = strategy.analyze(text=text)
            spans.update(span)
        # Bug fix: previously `annotated_text` was only bound when an
        # annotator existed, so a missing one raised UnboundLocalError.
        if self._annotator is None:
            raise ValueError("no annotator set; call set_annotator() first")
        return self._annotator.annotate(text, spans)

add_analyzer(name)

Add analyser

:param name: analyzer used for anonymisation

Source code in incognito_anonymizer/anonymizer.py
117
118
119
120
121
122
123
124
125
126
127
128
129
def add_analyzer(self, name: str):
    """
    Register an analyzer strategy by name (no-op when already active).

    :param name: key into self.ANALYZERS
    :raises Exception: if the name is unknown
    """
    if name not in self.ANALYZERS:
        raise Exception(f"{name} analyzer doesn't exist")
    # Local renamed from `analyzer` to avoid shadowing the analyzer module.
    strategy = self.ANALYZERS[name]
    if strategy not in self._analyzers:
        self._analyzers.append(strategy)

annotate(text)

Global function to annotate a text based on the chosen strategies

:param text: text to annotate :returns: annotated text

Source code in incognito_anonymizer/anonymizer.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def annotate(self, text: str) -> str:
    """
    Annotate a text based on the chosen strategies.

    :param text: text to annotate
    :returns: annotated text
    :raises ValueError: if no annotator has been set
    """
    spans = {}
    for strategy in self._analyzers:
        strategy.info = self._infos
        span = strategy.analyze(text=text)
        spans.update(span)
    # Bug fix: previously `annotated_text` was only bound when an annotator
    # existed, so calling without one raised UnboundLocalError.
    if self._annotator is None:
        raise ValueError("no annotator set; call set_annotator() first")
    return self._annotator.annotate(text, spans)

anonymize(text, infos=None)

Global function to anonymise a text based on the chosen strategies

:param text: text to anonymize :returns: anonymized text

Source code in incognito_anonymizer/anonymizer.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def anonymize(self, text: str, infos: PersonalInfo = None) -> str:
    """
    Anonymize *text* with every registered analyzer, masking each match.

    :param text: text to anonymize
    :param infos: optional PersonalInfo overriding the stored one
    :returns: anonymized text, or the literal string "NaN" for empty input
    """
    if not text:
        # Empty/None input yields the sentinel string "NaN" rather than "".
        return "NaN"

    self._entities = []
    resolved_infos = infos if infos is not None else self._infos
    anonymized_text = text

    for strategy in self._analyzers:
        # NOTE(review): each strategy analyzes the output of the previous
        # mask pass, so recorded spans are offsets into the partially
        # masked text, not the original input.
        spans = strategy.analyze(text=anonymized_text, info=resolved_infos)

        for positions, replacement in spans.items():
            for start, end in positions:
                self._entities.append(DetectedEntity(
                    original=anonymized_text[start:end],
                    replacement=replacement,
                    type=replacement.strip("<>"),
                    start=start,
                    end=end
                ))

        anonymized_text = self._mask.mask(anonymized_text, spans)

    # Present entities in document order.
    self._entities.sort(key=lambda e: e.start)
    return anonymized_text

get_entities()

Function to get matched entities in anonymisation.

:returns: DetectedEntity class list

Source code in incognito_anonymizer/anonymizer.py
186
187
188
189
190
191
192
def get_entities(self) -> list[DetectedEntity]:
    """
    Return the entities detected during the last anonymization run.

    :returns: list of DetectedEntity objects (sorted by start offset
        by anonymize())
    """
    return self._entities

open_json_file(path)

Open input json file for personal infos

:param path: path of the json file :returns: file content :raises FileExistsError: if given file not found

Source code in incognito_anonymizer/anonymizer.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def open_json_file(self, path: str) -> dict:
    """
    Open an input json file containing personal infos.

    :param path: path of the json file
    :returns: parsed JSON content (annotation fixed: this returns the
        object produced by json.load, not a str)
    :raises FileNotFoundError: if the given file is not found
    """
    try:
        with open(path, "r") as f:
            data = json.load(f)
        return data
    except FileNotFoundError as e:
        # Log-and-reraise keeps the original traceback for callers.
        print(e)
        raise

open_text_file(path)

Open input txt file

:param path: path of the input txt file :returns: file content :raises FileExistsError: if given file not found

Source code in incognito_anonymizer/anonymizer.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def open_text_file(self, path: str) -> str:
    """
    Open an input txt file and return its full content.

    :param path: path of the input txt file
    :returns: file content as a single string
    :raises FileNotFoundError: if the given file is not found
    """
    try:
        with open(path, "r") as handle:
            return handle.read()
    except FileNotFoundError as err:
        # Report the missing file, then let the caller handle it.
        print(err)
        raise

set_annotator(name)

Set annotator

:param name: wanted annotator

Source code in incognito_anonymizer/anonymizer.py
143
144
145
146
147
148
149
150
151
152
def set_annotator(self, name: str):
    """
    Select the annotator strategy by name.

    :param name: key into self.ANNOTATORS
    :raises Exception: if the name is unknown
    """
    if name not in self.ANNOTATORS:
        raise Exception(f"{name} annotator doesn't exist")
    self._annotator = self.ANNOTATORS[name]

set_info(infos)

Set personal info

:param infos: PersonalInfo

Source code in incognito_anonymizer/anonymizer.py
87
88
89
90
91
92
93
94
def set_info(self, infos: PersonalInfo):
    """
    Store *infos* as the active personal-info record.

    :param infos: PersonalInfo instance used by PII-aware analyzers
    :returns: the same PersonalInfo object, for chaining
    """
    self._infos = infos
    return infos

set_info_from_dict(**kwargs)

Set dict to PersonalInfo Class

:param infos: dict with all the Personal info values

Source code in incognito_anonymizer/anonymizer.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def set_info_from_dict(self, **kwargs):
    """
    Build a PersonalInfo from keyword values and store it.

    None values become empty strings and datetimes are rendered as
    YYYY-MM-DD before the PersonalInfo is constructed.

    :param kwargs: personal-info field values
    :returns: the created PersonalInfo object
    """
    def _normalize(value):
        # None -> "" ; datetime -> ISO-style date string ; else unchanged.
        if value is None:
            return ""
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d")
        return value

    cleaned = {key: _normalize(value) for key, value in kwargs.items()}
    self._infos = PersonalInfo(**cleaned)
    return self._infos

set_mask(name)

Set masks

:param name: wanted mask

Source code in incognito_anonymizer/anonymizer.py
131
132
133
134
135
136
137
138
139
140
141
def set_mask(self, name: str):
    """
    Select the mask strategy by name.

    :param name: key into self.MASKS
    :raises Exception: if the name is unknown
    """
    if name not in self.MASKS:
        raise Exception(f"{name} mask doesn't exist")
    self._mask = self.MASKS[name]

Analyzer

AnalyzerStrategy

Constructeur de la Class Strategy

Source code in incognito_anonymizer/analyzer.py
23
24
25
26
27
class AnalyzerStrategy:
    """Base class for analysis strategies; subclasses implement analyze()."""

    def analyze(self, text: str, info: PersonalInfo = None):
        """
        Detect sensitive spans in *text*.

        :param text: text to analyze
        :param info: optional PersonalInfo used by PII-aware strategies
        :raises NotImplementedError: always; subclasses must override
        """
        # Bug fix: the original signature omitted `self`, so calling the
        # method on an instance bound the instance to `text` (and made
        # `analyze(text=...)` raise a TypeError).
        raise NotImplementedError()

LossyStrategy

Bases: RegexStrategy

Find word position based on regex

:param text: text to anonymise :returns: List of tuples where each tuple contains: - A tuple with the start and end positions of the word. - The replacement string. .. warning:: This strategy is intentionally lossy: it trades recall precision for maximum anonymization coverage. Information loss is expected and assumed.

Source code in incognito_anonymizer/analyzer.py
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
class LossyStrategy(RegexStrategy):
    """
    Find name-shaped word positions using aggressive regex patterns.

    .. warning::
        This strategy is intentionally lossy: it trades recall precision for
        maximum anonymization coverage. Information loss is expected and
        assumed.
    """

    def __init__(self):
        super().__init__()
        # title_regex is inherited from RegexStrategy.__init__; kept here
        # commented out for reference only.
        # self.title_regex = r"([Dd][Rr][.]?|[Dd]octeur|[mM]r?[.]?|[Ii]nterne[ ]*:?|INT|[Ee]xterne[ ]*:?|[Mm]onsieur|[Mm]adame|[Rr].f.rent[ ]*:?|[P][Rr][.]?|[Pp]rofesseure|[Pp]rofesseur|[Mm]me[.]?|[Ee]nfant|[Mm]lle|[Nn]ée?|[Cc]hef(fe)? de service|[Nn]om :)"
        # Mapping of regex pattern -> replacement tag. Patterns 1-6 match
        # bare names; patterns 7-12 repeat them with an optional title
        # prefix (Dr, Mme, Professeur, ...) via f-string interpolation.
        self.LOSSY_PATTERNS = {
            # DUPONT Martin, DUPONT de TOTO Martin, or DUPONT-TOTO Martin
            r"([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){2,}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){0,3}[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z][a-z-éèçùàâêîôûëïü]{2,})*": "<NAME>",
            # Martin DUPONT, Martin DUPONT de TOTO, or Martin DUPONT-TOTO
            r"[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*[ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){2,}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){0,3}": "<NAME>",
            # J. Pierre or J.P. Marie
            r"([A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]\.){1,3}[ \t]*[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*": "<NAME>",
            # DUPONT Jean-Philippe or DUPONT Jean Philippe (compound first name, with or without hyphen)
            r"([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){2,}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){0,3}[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*([ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*)+": "<NAME>",
            # L Philippe or L. Philippe (initial followed by a first name)
            r"\b[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]\.?[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*": "<NAME>",
            # Philippe LOC'H (first name followed by a surname containing an apostrophe)
            r"[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{2,})*[ \t]+([A-Z][A-Z'-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){2,}": "<NAME>",
            # DUPONT Martin or DUPONT de TOTO Martin (optional title prefix)
            # NOTE(review): the {"{2,}"} quantifier below is a same-quote
            # nested f-string expression — requires Python >= 3.12; the
            # sibling patterns use {{...}} escaping instead. Confirm the
            # supported Python versions.
            rf"(?:{self.title_regex}[ \t\n]+)?([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){"{2,}"}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){{0,3}}[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z][a-z-éèçùàâêîôûëïü]{{2,}})*": "<NAME>",
            # Martin DUPONT (optional title prefix)
            rf"(?:{self.title_regex}[ \t\n]+)?[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*[ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){{2,}}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){{0,3}}": "<NAME>",
            # J. Pierre or J.P. Marie (optional title prefix)
            rf"(?:{self.title_regex}[ \t\n]+)?([A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]\.){{1,3}}[ \t]*[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*": "<NAME>",
            # DUPONT Jean-Philippe (optional title prefix)
            rf"(?:{self.title_regex}[ \t\n]+)?([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){{2,}}([ \t]+([A-Z][A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*|de|du|des|von|van|le|la)){{0,3}}[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*([ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*)+": "<NAME>",
            # L. Philippe (optional title prefix)
            rf"(?:{self.title_regex}[ \t\n]+)?\b[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]\.?[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*": "<NAME>",
            # Philippe LOC'H (optional title prefix)
            rf"(?:{self.title_regex}[ \t\n]+)?[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}}(-[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{{2,}})*[ \t]+([A-Z][A-Z'-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*){{2,}}": "<NAME>",
            # B. ALBERT (initial + dot + uppercase surname)
            rf"(?:{self.title_regex}[ \t\n]+)?[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]\.[ \t]+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]{{2,}}([A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]*)": "<NAME>",

        }
    def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        Analyze text using an aggressive uppercase-based matching strategy.

        :param text: Text to anonymize.
        :returns: Dictionary mapping span tuples to replacement strings.

        .. warning::
            This strategy can suppress legitimate content. Any token matching
            the uppercase pattern will be replaced, regardless of whether it
            is actually a personal identifier.
        """

        self.position = {}
        # Strip DEL control characters first; all spans below index into
        # the stripped text, not the caller's original string.
        text = text.replace('\x7f', '')
        for pattern, repl in self.LOSSY_PATTERNS.items():
            # overlapped=True requires the third-party `regex` module.
            matches_iter = list(regex.finditer(pattern, text, overlapped=True))
            if not matches_iter:
                continue

            spans = [match.span() for match in matches_iter]
            filtered_spans = self._remove_overlapping_spans(spans)
            existing_keys = list(self.position.keys())

            # NOTE(review): "overlap" here means exact span equality (tuple
            # membership), not range intersection — confirm this is intended.
            overlapping_keys = [
                key
                for key in existing_keys
                if any(span in key for span in filtered_spans)
                or any(k in filtered_spans for k in key)
            ]

            if overlapping_keys:
                # Merge all colliding keys plus the new spans into one
                # sorted key; the latest pattern's replacement wins.
                combined_key = tuple(
                    sorted(
                        set(span for key in overlapping_keys for span in key).union(
                            filtered_spans
                        )
                    )
                )
                for key in overlapping_keys:
                    del self.position[key]
                self.position[combined_key] = repl
            else:
                self.position[tuple(filtered_spans)] = repl

        self.position = self._resolve_position_conflicts(self.position)
        return self.position

    def analyze(self, text: str, info: PersonalInfo = None):
        """
        Run the lossy regex pass over *text*.

        :param text: text to anonymize
        :param info: unused; accepted for interface compatibility
        """
        # Deliberately warn on every call: this strategy is destructive.
        warnings.warn(
            "LossyStrategy.analyze() uses aggressive pattern matching that may cause "
            "unintended information loss. Tokens matching the uppercase pattern will be "
            "replaced unconditionally, including potential false positives such as "
            "acronyms, place names, or medical terminology. "
            "Use a more precise strategy if data integrity is critical.",
            UserWarning,
            stacklevel=2,
        )
        return self.multi_subs_by_regex(text)

analyze(text, info=None)

Hide text using regular expression :param text: text to anonymize

Source code in incognito_anonymizer/analyzer.py
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
def analyze(self, text: str, info: PersonalInfo = None):
    """
    Run the lossy regex pass over *text*.

    :param text: text to anonymize
    :param info: unused; accepted for interface compatibility
    """
    message = (
        "LossyStrategy.analyze() uses aggressive pattern matching that may cause "
        "unintended information loss. Tokens matching the uppercase pattern will be "
        "replaced unconditionally, including potential false positives such as "
        "acronyms, place names, or medical terminology. "
        "Use a more precise strategy if data integrity is critical."
    )
    # Deliberately warn on every call: this strategy is destructive.
    warnings.warn(message, UserWarning, stacklevel=2)
    return self.multi_subs_by_regex(text)

multi_subs_by_regex(text)

Analyze text using an aggressive uppercase-based matching strategy.

:param text: Text to anonymize. :returns: Dictionary mapping span tuples to replacement strings.

.. warning:: This strategy can suppress legitimate content. Any token matching the uppercase pattern will be replaced, regardless of whether it is actually a personal identifier.

Source code in incognito_anonymizer/analyzer.py
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
    """
    Analyze text using an aggressive uppercase-based matching strategy.

    :param text: Text to anonymize.
    :returns: Dictionary mapping span tuples to replacement strings.

    .. warning::
        This strategy can suppress legitimate content. Any token matching
        the uppercase pattern will be replaced, regardless of whether it
        is actually a personal identifier.
    """

    self.position = {}
    # Strip DEL control characters first; all spans below index into the
    # stripped text, not the caller's original string.
    text = text.replace('\x7f', '')
    for pattern, repl in self.LOSSY_PATTERNS.items():
        # overlapped=True requires the third-party `regex` module; plain
        # re.finditer would skip overlapping matches.
        matches_iter = list(regex.finditer(pattern, text, overlapped=True))
        if not matches_iter:
            continue

        spans = [match.span() for match in matches_iter]
        filtered_spans = self._remove_overlapping_spans(spans)
        existing_keys = list(self.position.keys())

        # NOTE(review): "overlap" here means exact span equality (tuple
        # membership), not range intersection — confirm this is intended.
        overlapping_keys = [
            key
            for key in existing_keys
            if any(span in key for span in filtered_spans)
            or any(k in filtered_spans for k in key)
        ]

        if overlapping_keys:
            # Merge all colliding keys plus the new spans into one sorted
            # key; the latest pattern's replacement wins.
            combined_key = tuple(
                sorted(
                    set(span for key in overlapping_keys for span in key).union(
                        filtered_spans
                    )
                )
            )
            for key in overlapping_keys:
                del self.position[key]
            self.position[combined_key] = repl
        else:
            self.position[tuple(filtered_spans)] = repl

    self.position = self._resolve_position_conflicts(self.position)
    return self.position

PiiStrategy

Bases: AnalyzerStrategy

Detect personal infos

Source code in incognito_anonymizer/analyzer.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
class PiiStrategy(AnalyzerStrategy):
    """Detect personal infos supplied through a PersonalInfo record."""

    # Date formats tried against the patient birthdate.
    _DATE_FORMATS = (
        "%m/%d/%Y",
        "%m %d %Y",
        "%m:%d:%Y",
        "%m-%d-%Y",
        "%Y-%m-%d",
        "%d/%m/%Y",
    )

    def __init__(self):
        pass

    @staticmethod
    def _strip_accents(value: str) -> str:
        """Return *value* with combining accent marks removed (NFD fold)."""
        return "".join(
            c
            for c in unicodedata.normalize("NFD", value)
            if unicodedata.category(c) != "Mn"
        )

    def hide_by_keywords(
        self, text: str, keywords: Iterable[Tuple[str, str]]
    ) -> Dict[Tuple[int, int], str]:
        """
        Hide text using keywords and return positions with replacements.

        :param text: text to anonymize
        :param keywords: Iterable of tuples (word, replacement).
        :returns: dict mapping ((start, end),) span tuples to replacements

        .. note::
            Offsets are computed on the accent-stripped text. NFD stripping
            can change string length, so spans may not align with the
            original text — TODO confirm callers mask the normalized form.
        """
        processor = KeywordProcessor(case_sensitive=False)
        for key, replacement in keywords:
            # Strip accents so "José" also matches "Jose".
            processor.add_keyword(self._strip_accents(key), replacement)

        # Normalize the haystack the same way as the keywords.
        normalized_text = self._strip_accents(text)
        # Extract keywords together with their (start, end) offsets.
        found_keywords = processor.extract_keywords(normalized_text, span_info=True)

        # Wrap each span as a one-element tuple of (start, end) pairs so the
        # result shape matches the other analyzer strategies; later
        # duplicates overwrite earlier ones.
        return {
            ((start, end),): replacement
            for replacement, start, end in found_keywords
        }

    def analyze(self, text: str, info: PersonalInfo = None) -> dict:
        """
        Hide specific words based on the fields of *info*.

        :param text: text to anonymize
        :param info: PersonalInfo record; anything else yields an empty dict
        :returns: dict mapping span tuples to replacement strings
            (annotation fixed: this never returned a str)
        """
        if not isinstance(info, PersonalInfo):
            print("info must be a Personnal info type. Returning empty dict instead.")
            return {}

        keywords = [
            (info.first_name, "<NAME>"),
            (info.last_name, "<NAME>"),
            (info.birth_name, "<NAME>"),
            (info.ipp, "<IPP>"),
            (info.iep, "<IEP>"),
            (info.postal_code, "<CODE_POSTAL>"),
        ]
        # Bug fix: birthdate may be "" (Anonymizer.set_info_from_dict maps
        # None -> ""), and str has no strftime(); only format real dates.
        if hasattr(info.birthdate, "strftime"):
            keywords.extend(
                (info.birthdate.strftime(fmt), "<DATE>") for fmt in self._DATE_FORMATS
            )
        keywords.append((info.adress, "<ADRESSE>"))

        # Drop empty fields so they are never registered as keywords.
        return self.hide_by_keywords(text, [(k, t) for k, t in keywords if k])

analyze(text, info=None)

Hide specific words based on keywords

:param text: text to anonymize

Source code in incognito_anonymizer/analyzer.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def analyze(self, text: str, info: PersonalInfo = None) -> dict:
    """
    Hide specific words based on the fields of *info*.

    :param text: text to anonymize
    :param info: PersonalInfo record; anything else yields an empty dict
    :returns: dict mapping span tuples to replacement strings
        (annotation fixed: this never returned a str)
    """
    if not isinstance(info, PersonalInfo):
        print("info must be a Personnal info type. Returning empty dict instead.")
        return {}

    keywords = [
        (info.first_name, "<NAME>"),
        (info.last_name, "<NAME>"),
        (info.birth_name, "<NAME>"),
        (info.ipp, "<IPP>"),
        (info.iep, "<IEP>"),
        (info.postal_code, "<CODE_POSTAL>"),
    ]
    # Bug fix: birthdate may be "" (set_info_from_dict maps None -> ""),
    # and str has no strftime(); only format real date objects.
    if hasattr(info.birthdate, "strftime"):
        for fmt in ("%m/%d/%Y", "%m %d %Y", "%m:%d:%Y", "%m-%d-%Y", "%Y-%m-%d", "%d/%m/%Y"):
            keywords.append((info.birthdate.strftime(fmt), "<DATE>"))
    keywords.append((info.adress, "<ADRESSE>"))

    # Drop empty fields so they are never registered as keywords.
    return self.hide_by_keywords(text, [(k, t) for k, t in keywords if k])

hide_by_keywords(text, keywords)

Hide text using keywords and return positions with replacements.

:param text: text to anonymize :param keywords: Iterable of tuples (word, replacement).

:returns: List of tuples where each tuple contains: - A tuple with the start and end positions of the word. - The replacement string.

Source code in incognito_anonymizer/analyzer.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def hide_by_keywords(
    self, text: str, keywords: Iterable[Tuple[str, str]]
) -> Dict[Tuple[int, int], str]:
    """
    Hide text using keywords and return positions with replacements.

    :param text: text to anonymize
    :param keywords: Iterable of tuples (word, replacement).
    :returns: dict mapping ((start, end),) span tuples to replacement strings

    .. note::
        Offsets are computed on the accent-stripped (NFD, marks removed)
        text. That normalization can change string length, so spans may not
        align with the original text — TODO confirm callers mask the
        normalized form.
    """
    processor = KeywordProcessor(case_sensitive=False)
    for key, masks in keywords:
        # Strip combining accents so "José" also matches "Jose".
        key = "".join(
            (
                c
                for c in unicodedata.normalize("NFD", key)
                if unicodedata.category(c) != "Mn"
            )
        )
        processor.add_keyword(key, masks)

    # Normalize the haystack exactly like the keywords above.
    normalized_text = "".join(
        (
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
    )
    # Extract keywords together with their (start, end) offsets.
    found_keywords = processor.extract_keywords(normalized_text, span_info=True)

    result = {}
    for replacement, start, end in found_keywords:
        # Wrap positions as a tuple of tuples so the result shape matches
        # the other analyzer strategies; later duplicates overwrite
        # earlier entries for the same span.
        key = ((start, end),)
        result[key] = replacement
    return result

RegexStrategy

Bases: AnalyzerStrategy

Detect word based on regex

Source code in incognito_anonymizer/analyzer.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
class RegexStrategy(AnalyzerStrategy):
    """Detect words to anonymize using regular expressions.

    Builds a table of regex patterns (``self.PATTERNS``) mapping each
    pattern to the replacement tag it produces (``<NAME>``, ``<DATE>``,
    ``<EMAIL>``, ...), then resolves overlapping matches so each span is
    reported once.
    """

    def __init__(self):
        super().__init__()
        # Name-shape building blocks. \p{Ll} (any Unicode lowercase letter)
        # requires the third-party ``regex`` module, not stdlib ``re``.
        Xxxxx = r"[A-ZÀ-Ÿ]\p{Ll}+"
        XXxX_ = r"[A-ZÀ-Ÿ][A-ZÀ-Ÿ\p{Ll}-]"
        XXxX_apostrophe = r"[A-ZÀ-Ÿ][A-ZÀ-Ÿ\p{Ll}-]*(?:[''][A-ZÀ-Ÿ][A-ZÀ-Ÿ\p{Ll}-]*)?"

        # Separator between name parts: optional spaces or a hyphen.
        sep = r"(?:[ ]*|-)?"
        # French month names and their usual abbreviations (case-insensitive).
        mois = r"(?i)(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre|janv?[.]?|févr?[.]?|fevr?[.]?|avr[.]?|juil[.]?|sept?[.]?|oct[.]?|nov[.]?|déc[.]?|dec[.]?)"

        # Full literal date: "8 juillet 2020"
        self.date_litteral_full = (
            rf"\b(0?[1-9]|[12]\d|3[01])[\s]+{mois}[\s,]+((?:1[6-9]|[2-9]\d)\d{{2}})\b"
        )

        # Partial date without a year: "20 mars"
        self.date_litteral_partial = rf"\b(0?[1-9]|[12]\d|3[01])[\s]+{mois}\b"

        # Month alone: "juillet 2020" or just "juillet"
        self.mois_pattern = rf"\b{mois}(?:[\s]+((?:1[6-9]|[2-9]\d)\d{{2}}))?\b"
        # Honorifics / role words that typically precede a person's name.
        self.title_regex = r"([Dd][Rr][.]?|[Dd]octeur|[mM]r?[.]?|[Ii]nterne[ ]*:?|INT|[Ee]xterne[ ]*:?|[Mm]onsieur|[Mm]adame|[Rr].f.rent[ ]*:?|[P][Rr][.]?|[Pp]rofesseure|[Pp]rofesseur|[Mm]me[.]?|[Ee]nfant|[Mm]lle|[Nn]ée?|[Cc]hef(fe)? de service|[Nn]om :)"

        self.email_pattern = (
            r"(?i)"
            r"(?:"
            r"[a-z0-9!#$%&'*+/=?^_`{|}~<>()\[\]\\:;,@\"\-]+"  # extended local part
            r"(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~<>()\[\]\\:;,@\"\-]+)*"
            r"|"
            r"\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\""
            r")"
            r"@"
            r"(?:"
            r"(?:[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-](?:[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-]*[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-])?\.)*"
            r"[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-](?:[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-]*[a-z0-9*<>()\[\]!#$%&'+=?^_`{|}~-])?"
            r"|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
            r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:"
            r"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]"
            r")"
        )
        # Needs a comma or \r to match; an address in the middle of a sentence
        # will not match this pattern.
        self.adresse_pattern = r"(?i)\d{1,4}\s*(?:bis|ter|quater)?\s+(?:rue|avenue|av\.|boulevard|bd\.?|impasse|allée|allee|chemin|route|place|square|résidence|residence|hameau|lieu[- ]dit|voie|passage|villa|domaine|lotissement|parc|traverse|ruelle|sentier|cours|quai|esplanade)\s+[a-z0-9éèàùâêîôûïëüçæœ'\-\.]+(?:\s+[a-z0-9éèàùâêîôûïëüçæœ'\-\.]+){0,10},?\s*\d{5},?\s*[a-zéèàùâêîôûïëüçæœ'\-\.]+(?:\s+[a-zéèàùâêîôûïëüçæœ'\-\.]+){0,5}(?=\s*[,\{\n]|$)"

        # INFO: Non-restrictive regexp matching up to 3 words after a street keyword.
        self.fast_adresse_pattern = r"(?i)(?:\d+\s+)?(rue|avenue|av|boulevard|bd|bld|allée|allee|impasse|chemin|route|place|square|villa|passage|domaine|hameau|lotissement|résidence|residence|quartier|sentier|traverse|cours|quai|esplanade|promenade|rond[- ]point)\b(?:\s+\S+){1,3}"

        # "75001 PARIS"-style zip code followed by an upper-case city name.
        self.zip_city_name = (
            r"\b(\d{5})\s+([A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ][A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ\s\-]+)\b"
        )
        # Pattern -> replacement tag. Iteration order matters: more specific
        # date patterns come before the generic ones.
        self.PATTERNS = {
            # rf"(?<={self.title_regex})([\s-][A-Z]+)+([\s-][A-Z][a-z]+)+(?![a-z])": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<LN0>[A-ZÀ-Ÿ][A-ZÀ-Ÿ](?:{sep}(?:ep[.]|de|[A-ZÀ-Ÿ]+))*)[ ]+(?P<FN0>{Xxxxx}(?:{sep}{Xxxxx})*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<FN1>{Xxxxx}(?:{sep}{Xxxxx})*)[ ]+(?P<LN1>[A-ZÀ-Ÿ][A-ZÀ-Ÿ]+(?:{sep}(?:ep[.]|de|[A-ZÀ-Ÿ]+))*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<LN3>{Xxxxx}(?:(?:-|[ ]de[ ]|[ ]ep[.][ ]){Xxxxx})*)[ ]+(?P<FN2>{Xxxxx}(?:-{Xxxxx})*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<LN2>{XXxX_}+(?:{sep}{XXxX_}+)*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<FN0>[A-ZÀ-Ÿ][.])[ \t]+(?P<LN0>{XXxX_}+(?:{sep}{XXxX_}+)*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<FN0>[A-ZÀ-Ÿ][.](?:[A-ZÀ-Ÿ][.])*)\s+(?P<LN0>{XXxX_apostrophe}+(?:{sep}{XXxX_apostrophe}+)*)": "<NAME>",
            rf"(?P<TITLE>{self.title_regex}[ \n]+)(?P<FN0>[A-ZÀ-Ÿ][.](?:[A-ZÀ-Ÿ][.])*)\s+(?:de |d'|du |des )?(?P<LN0>{XXxX_apostrophe}+(?:{sep}{XXxX_apostrophe}+)*)": "<NAME>",
            # r"[12]\s*[0-9]{2}\s*(0[1-9]|1[0-2])\s*(2[AB]|[0-9]{2})\s*[0-9]{3}\s*[0-9]{3}\s*(?:\(?([0-9]{2})\)?)?": "<NIR>",
            # r"(?:(?:\+|00)33[\s.-]*|0)[\s.-]*[1-9](?:[\s.-]*\d{2}){4}": "<PHONE>",
            self.date_litteral_full: "<DATE>",  # "8 juillet 2020" -- most specific first
            self.date_litteral_partial: "<DATE>",  # "20 mars"
            self.mois_pattern: "<DATE>",
            r"\b(0?[1-9]|[12]\d|3[01])(\/|-|\.)(0?[1-9]|1[0-2])\2((?:(?:1[6-9]|[2-9]\d)\d{2}|\d{2}))\b": "<DATE>",
            self.email_pattern: "<EMAIL>",
            self.adresse_pattern: "<ADRESSE>",
            self.zip_city_name: "<ADRESSE>",
            # NOTE(review): duplicate key -- self.mois_pattern already maps to
            # "<DATE>" above; this entry is redundant (dict keys are unique).
            self.mois_pattern: "<DATE>",
            r"(?:(?:\+|00)33[\s.-]*|0)[\s.-]*[1-9](?:[\s.-]*\d{2}){4}|\(?\d[\d\s]{6,}\d": "<NUMBER>",
            self.fast_adresse_pattern: "<ADRESSE>",
            # r"[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ]{4,}\s+[A-Z-ÉÈÀÂÊÎÔÛËÏÜÙÇ][a-z-éèçùàâêîôûëïü]{4,}\s": "<NAME>",
        }


    def _replace(self, match):
        """Substitution callback: keep the matched title, replace the name with <NAME>."""
        title = match.group('TITLE') if 'TITLE' in match.groupdict() else ''
        return title + "<NAME>"

    def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        Find word positions using the regexes in ``self.PATTERNS``.

        :param text: text to anonymise
        :returns: dict mapping a tuple of (start, end) spans to the
                replacement string for those spans.
        """

        self.position = {}

        for pattern, repl in self.PATTERNS.items():
            matches_iter = list(regex.finditer(pattern, text, overlapped=True))
            if not matches_iter:
                continue

            # spans = [match.span() for match in matches_iter]
            spans = []
            for match in matches_iter:
                groups = match.groupdict()
                # If there are named LN/FN groups, keep only their span
                name_groups = [k for k in groups if (k.startswith('LN') or k.startswith('FN')) and groups[k] is not None]
                if name_groups:
                    # Take the span enclosing all LN/FN groups
                    start = min(match.start(g) for g in name_groups)
                    end = max(match.end(g) for g in name_groups)
                    spans.append((start, end))
                else:
                    spans.append(match.span())
            # Deduplication: among overlapping spans, keep only the longest
            filtered_spans = self._remove_overlapping_spans(spans)
            existing_keys = list(self.position.keys())
            overlapping_keys = []
            for key in existing_keys:
                # NOTE(review): this is a membership test (identical span
                # coordinates), not a geometric overlap test.
                if any(span in key for span in filtered_spans) or any(
                    k in filtered_spans for k in key
                ):
                    overlapping_keys.append(key)

            if overlapping_keys:
                # Merge every key sharing a span with the new spans into one
                # combined key; the newest replacement tag wins.
                combined_key = tuple(
                    sorted(
                        set(span for key in overlapping_keys for span in key).union(
                            filtered_spans
                        )
                    )
                )
                for key in overlapping_keys:
                    del self.position[key]
                self.position[combined_key] = repl
            else:
                self.position[tuple(filtered_spans)] = repl

        result = {}
        for k, v in self.position.items():
            if v != "<EMAIL>":
                result[k] = v
                continue

            # For emails, keep only the longest span per end offset
            # (presumably to collapse nested matches produced by
            # overlapped matching -- TODO confirm).
            email_tuples = list(k)
            ends = {}

            for start, end in email_tuples:
                length = end - start
                if end not in ends or length > (ends[end][1] - ends[end][0]):
                    ends[end] = (start, end)

            result[tuple(ends.values())] = "<EMAIL>"

        self.position = self._resolve_position_conflicts(result)
        return self.position

    def analyze(self, text: str, info: PersonalInfo = None):
        """
        Detect spans to anonymize using the regular-expression table.

        :param text: text to anonymize
        :param info: not used by this strategy
        :returns: dict of span tuples -> replacement tag (see multi_subs_by_regex)
        """
        return self.multi_subs_by_regex(text)

    def _remove_overlapping_spans(self, spans: list) -> list:
        """
        From a set of potentially overlapping spans, keep only
        non-overlapping spans, preferring the longest ones.
        """
        if not spans:
            return spans

        # Sort by decreasing length (longest spans win).
        sorted_spans = sorted(spans, key=lambda s: s[1] - s[0], reverse=True)

        kept = []
        for span in sorted_spans:
            start, end = span
            # Check whether this span overlaps any already-kept span.
            overlaps = any(
                not (end <= kept_start or start >= kept_end)
                for kept_start, kept_end in kept
            )
            if not overlaps:
                kept.append(span)

        # Re-sort by start position.
        return sorted(kept, key=lambda s: s[0])

    def _spans_overlap(self, span1: Tuple[int, int], span2: Tuple[int, int]) -> bool:
        """Return True if the two (start, end) spans overlap."""
        return not (span1[1] <= span2[0] or span2[1] <= span1[0])

    def _resolve_position_conflicts(
        self, positions: Dict[Tuple[Tuple[int, int]], str]
    ) -> Dict[Tuple[Tuple[int, int]], str]:
        """
        For position keys that overlap and carry the same value, keep
        only the key containing the widest span.

        :param positions: dict with tuples of spans as keys and replacements as values
        :returns: filtered dict without position conflicts
        """
        result = dict(positions)
        keys = list(result.keys())
        to_delete = set()

        for i, key1 in enumerate(keys):
            if key1 in to_delete:
                continue
            for key2 in keys[i + 1 :]:
                if key2 in to_delete:
                    continue
                # Only keys with the same replacement tag can conflict.
                if result[key1] != result[key2]:
                    continue

                has_overlap = any(
                    self._spans_overlap(s1, s2) for s1 in key1 for s2 in key2
                )

                if has_overlap:
                    # Keep the key whose single widest span is larger.
                    len1 = max(end - start for start, end in key1)
                    len2 = max(end - start for start, end in key2)
                    to_delete.add(key1 if len1 < len2 else key2)

        for key in to_delete:
            del result[key]

        return result

analyze(text, info=None)

Hide text using regular expression :param text: text to anonymize

Source code in incognito_anonymizer/analyzer.py
262
263
264
265
266
267
def analyze(self, text: str, info: PersonalInfo = None):
    """
    Detect spans to anonymize using the regular-expression table.

    :param text: text to anonymize
    :param info: not used by this strategy
    :returns: dict of span tuples -> replacement tag (see multi_subs_by_regex)
    """
    return self.multi_subs_by_regex(text)

multi_subs_by_regex(text)

Find word position based on regex

:param text: text to anonymise :returns: List of tuples where each tuple contains: - A tuple with the start and end positions of the word. - The replacement string.

Source code in incognito_anonymizer/analyzer.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def multi_subs_by_regex(self, text: str) -> Dict[Tuple[Tuple[int, int]], str]:
    """
    Find word positions using the regexes in ``self.PATTERNS``.

    :param text: text to anonymise
    :returns: dict mapping a tuple of (start, end) spans to the
            replacement string for those spans.
    """

    self.position = {}

    for pattern, repl in self.PATTERNS.items():
        matches_iter = list(regex.finditer(pattern, text, overlapped=True))
        if not matches_iter:
            continue

        # spans = [match.span() for match in matches_iter]
        spans = []
        for match in matches_iter:
            groups = match.groupdict()
            # If there are named LN/FN groups, keep only their span
            name_groups = [k for k in groups if (k.startswith('LN') or k.startswith('FN')) and groups[k] is not None]
            if name_groups:
                # Take the span enclosing all LN/FN groups
                start = min(match.start(g) for g in name_groups)
                end = max(match.end(g) for g in name_groups)
                spans.append((start, end))
            else:
                spans.append(match.span())
        # Deduplication: among overlapping spans, keep only the longest
        filtered_spans = self._remove_overlapping_spans(spans)
        existing_keys = list(self.position.keys())
        overlapping_keys = []
        for key in existing_keys:
            # NOTE(review): this is a membership test (identical span
            # coordinates), not a geometric overlap test.
            if any(span in key for span in filtered_spans) or any(
                k in filtered_spans for k in key
            ):
                overlapping_keys.append(key)

        if overlapping_keys:
            # Merge every key sharing a span with the new spans into one
            # combined key; the newest replacement tag wins.
            combined_key = tuple(
                sorted(
                    set(span for key in overlapping_keys for span in key).union(
                        filtered_spans
                    )
                )
            )
            for key in overlapping_keys:
                del self.position[key]
            self.position[combined_key] = repl
        else:
            self.position[tuple(filtered_spans)] = repl

    result = {}
    for k, v in self.position.items():
        if v != "<EMAIL>":
            result[k] = v
            continue

        # For emails, keep only the longest span per end offset
        # (presumably to collapse nested matches produced by
        # overlapped matching -- TODO confirm).
        email_tuples = list(k)
        ends = {}

        for start, end in email_tuples:
            length = end - start
            if end not in ends or length > (ends[end][1] - ends[end][0]):
                ends[end] = (start, end)

        result[tuple(ends.values())] = "<EMAIL>"

    self.position = self._resolve_position_conflicts(result)
    return self.position

Mask

FakeStrategy

Bases: Strategy

Replace word by natural placeholder

Source code in incognito_anonymizer/mask.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class FakeStrategy(Strategy):
    """Replace detected entities with natural-looking placeholder values."""

    def __init__(self):
        # Entity tag -> realistic replacement value.
        # NOTE: the attribute name keeps the historical spelling
        # ("placehodler") because callers may reference it directly.
        self.natural_placehodler = {
            "<PER>": "Margaret Hamilton",
            "<NAME>": "Margaret Hamilton",
            "<CODE_POSTAL>": "42000",
            "<DATE>": "1970/01/01",
            "<IPP>": "IPPPH:0987654321",
            "<NIR>": "012345678987654",
            "<EMAIL>": "place.holder@anonymization.cdc",
            "<PHONE>": "0611223344",
            "<ADRESSE>": "35 Rue Margaret Hamilton",
            "<NUMBER>": "123456789"
        }

    def mask(self, text: str, coordinate: Dict[List[Tuple], str]) -> str:
        """
        Replace in text, words at the given coordinates by a natural placeholder.

        :param text: text to anonymize
        :param coordinate: mapping of span tuples to the placeholder tag to substitute
        :returns: anonymized text

        Example :
        >>> anonymizer = FakeStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        'Margaret Hamilton'
        """
        text_as_list = list(text)
        all_positions = []
        for spans, tag in coordinate.items():
            # Fall back to the raw tag for unknown entity types instead of
            # raising KeyError (fixes crash when an analyzer emits a tag
            # that has no natural value registered).
            repl = self.natural_placehodler.get(tag, tag)
            all_positions.extend((start, end, repl) for start, end in spans)

        # Replace from the end of the text so earlier offsets stay valid.
        all_positions.sort(key=lambda x: x[0], reverse=True)
        for start, end, repl in all_positions:
            text_as_list[start:end] = list(repl)
        return "".join(text_as_list)

mask(text, coordinate)

Replace in text, words at the given coordinates by a natural placeholder.

:param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text

Example :

anonymizer = FakeStrategy() text = "Bob" coordinate = {((0,3),): '',} anonymizer.mask(text, coordinate) 'Margaret Hamilton'

Source code in incognito_anonymizer/mask.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def mask(self, text: str, coordinate: Dict[List[Tuple], str]) -> str:
    """
    Replace in text, words at the given coordinates by a natural placeholder.

    :param text: text to anonymize
    :param coordinate: mapping of span tuples to the placeholder tag to substitute
    :returns: anonymized text

    Example :
    >>> anonymizer = FakeStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    'Margaret Hamilton'
    """
    text_as_list = list(text)
    all_positions = []
    for spans, repl in coordinate.items():
        # Look up the natural value for the tag; raises KeyError if the tag
        # is not registered in self.natural_placehodler.
        repl = self.natural_placehodler[repl]
        all_positions.extend((start, end, repl) for start, end in spans)

    # Replace from the end of the text so earlier offsets stay valid.
    all_positions.sort(key=lambda x: x[0], reverse=True)
    for start, end, repl in all_positions:
        text_as_list[start:end] = list(repl)
    return "".join(text_as_list)

HashStrategy

Bases: Strategy

Replace words with their hash

Source code in incognito_anonymizer/mask.py
119
120
121
122
123
class HashStrategy(Strategy):
    """Replace words with their hash."""

    # TODO: choose the hashing scheme -- e.g. blake2 truncated to 8 digits,
    # or a heavier variant (~20 bytes). No ``mask`` override exists yet.
    pass

HideStrategy

Bases: Strategy

Replace by *

Source code in incognito_anonymizer/mask.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
class HideStrategy(Strategy):
    """Mask detected spans by overwriting them with asterisks."""

    def mask(self, text, coordinate: Dict[List[Tuple], str]) -> str:
        """
        Replace in text, words at the given coordinates by *.
        :param text: text to anonymize
        :param coordinate: positions (and placeholder tags) of the words to replace
        :returns: anonymized text

        Example :
        >>> anonymizer = HideStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        '********'

        """
        chars = list(text)

        # Flatten every span tuple into a single list of (start, end) pairs.
        flat_spans = [span for spans in coordinate for span in spans]

        # Work right-to-left so earlier offsets are not shifted by the edits.
        flat_spans.sort(key=lambda span: span[0], reverse=True)
        for start, end in flat_spans:
            width = end - start
            # Words shorter than 5 chars get a fixed 8-star mask
            # (presumably to avoid revealing very short word lengths).
            chars[start:end] = "*" * (width if width >= 5 else 8)
        return "".join(chars)

mask(text, coordinate)

Replace in text, words at the given coordinates by *. :param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text

Example :

anonymizer = HideStrategy() text = "Bob" coordinate = {((0,3),): '',} anonymizer.mask(text, coordinate) '**'

Source code in incognito_anonymizer/mask.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def mask(self, text, coordinate: Dict[List[Tuple], str]) -> str:
    """
    Replace in text, words at the given coordinates by *.
    :param text: text to anonymize
    :param coordinate: positions (and placeholder tags) of the words to replace
    :returns: anonymized text

    Example :
    >>> anonymizer = HideStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    '********'

    """
    chars = list(text)

    # Flatten every span tuple into a single list of (start, end) pairs.
    flat_spans = [span for spans in coordinate for span in spans]

    # Work right-to-left so earlier offsets are not shifted by the edits.
    flat_spans.sort(key=lambda span: span[0], reverse=True)
    for start, end in flat_spans:
        width = end - start
        # Words shorter than 5 chars get a fixed 8-star mask
        # (presumably to avoid revealing very short word lengths).
        chars[start:end] = "*" * (width if width >= 5 else 8)
    return "".join(chars)

PlaceholderStrategy

Bases: Strategy

Replace by placeholders

Source code in incognito_anonymizer/mask.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class PlaceholderStrategy(Strategy):
    """Mask detected spans by substituting their placeholder tag."""

    def mask(self, text, coordinate: Dict[List[Tuple], str]) -> str:
        """
        Replace in text, words at the given coordinates by a placeholder.
        :param text: text to anonymize
        :param coordinate: mapping of span tuples to the placeholder to substitute
        :returns: anonymized text

        Example :
        >>> anonymizer = PlaceholderStrategy()
        >>> text = "Bob"
        >>> coordinate = {((0,3),): '<NAME>',}
        >>> anonymizer.mask(text, coordinate)
        '<NAME>'

        """
        chars = list(text)

        # Flatten into (start, end, tag) triples.
        tagged_spans = [
            (start, end, tag)
            for spans, tag in coordinate.items()
            for start, end in spans
        ]

        # Apply replacements right-to-left so earlier offsets stay valid.
        tagged_spans.sort(key=lambda item: item[0], reverse=True)
        for start, end, tag in tagged_spans:
            chars[start:end] = tag
        return "".join(chars)

mask(text, coordinate)

Replace in text, words at the given coordinates by a placeholder. :param text: text to anonymize :param coordinate: position and placeholder of the word to replace :returns: anonymized text

Example :

anonymizer = PlaceholderStrategy() text = "Bob" coordinate = {((0,3),): '',} anonymizer.mask(text, coordinate) ''

Source code in incognito_anonymizer/mask.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def mask(self, text, coordinate: Dict[List[Tuple], str]) -> str:
    """
    Replace in text, words at the given coordinates by a placeholder.
    :param text: text to anonymize
    :param coordinate: mapping of span tuples to the placeholder to substitute
    :returns: anonymized text

    Example :
    >>> anonymizer = PlaceholderStrategy()
    >>> text = "Bob"
    >>> coordinate = {((0,3),): '<NAME>',}
    >>> anonymizer.mask(text, coordinate)
    '<NAME>'

    """
    chars = list(text)

    # Flatten into (start, end, tag) triples.
    tagged_spans = [
        (start, end, tag)
        for spans, tag in coordinate.items()
        for start, end in spans
    ]

    # Apply replacements right-to-left so earlier offsets stay valid.
    tagged_spans.sort(key=lambda item: item[0], reverse=True)
    for start, end, tag in tagged_spans:
        chars[start:end] = tag
    return "".join(chars)