Skip to content

Documentation for comparison_level_library

distance_function_level(col_name, distance_function_name, distance_threshold, higher_is_more_similar=True, m_probability=None)

Represents a comparison using a user-provided distance function, where the similarity is assessed by comparing the function's value against a threshold.

Parameters:

Name Type Description Default
col_name str

Input column name

required
distance_function_name str

The name of the distance function

required
distance_threshold Union[int, float]

The threshold to use to assess similarity

required
higher_is_more_similar bool

If True, a higher value of the distance function indicates a higher similarity (e.g. jaro_winkler). If False, a higher value indicates a lower similarity (e.g. levenshtein).

True
m_probability float

Starting value for m probability. Defaults to None.

None

Returns:

Name Type Description
ComparisonLevel ComparisonLevel

A comparison level for a given distance function

Source code in splink/comparison_level_library.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def distance_function_level(
    col_name: str,
    distance_function_name: str,
    distance_threshold: Union[int, float],
    higher_is_more_similar: bool = True,
    m_probability=None,
) -> ComparisonLevel:
    """Build a comparison level that thresholds a named SQL distance function.

    The generated SQL condition applies ``distance_function_name`` to the left
    and right versions of the column and compares the result against
    ``distance_threshold`` using ``>=`` or ``<=``.

    Args:
        col_name (str): Input column name
        distance_function_name (str): The name of the distance function, as it
            should appear in the generated SQL
        distance_threshold (Union[int, float]): The threshold to use to assess
            similarity
        higher_is_more_similar (bool): If True, the condition is
            ``function(...) >= threshold`` (a higher value means more similar,
            e.g. jaro_winkler).  If False, the condition is
            ``function(...) <= threshold`` (a higher value means less similar,
            e.g. levenshtein).
        m_probability (float, optional): Starting value for m probability.  Only
            added to the level when truthy.  Defaults to None.

    Returns:
        ComparisonLevel: A comparison level for a given distance function
    """
    dialect = _mutable_params["dialect"]
    input_col = InputColumn(col_name, sql_dialect=dialect)

    comparator = ">=" if higher_is_more_similar else "<="

    left, right = input_col.name_l(), input_col.name_r()
    function_call = f"{distance_function_name}({left}, {right})"
    label = f"{distance_function_name} {comparator} {distance_threshold}"

    settings = {
        "sql_condition": f"{function_call} {comparator} {distance_threshold}",
        "label_for_charts": label,
    }
    # Falsy values (None, 0) are deliberately left out of the level settings.
    if m_probability:
        settings["m_probability"] = m_probability

    return ComparisonLevel(settings, sql_dialect=dialect)

null_level(col_name)

Represents comparisons where one or both sides of the comparison contains null values so the similarity cannot be evaluated. Assumed to have a partial match weight of zero (null effect on overall match weight).

Parameters:

Name Type Description Default
col_name str

Input column name

required

Returns:

Name Type Description
ComparisonLevel ComparisonLevel

Comparison level

Source code in splink/comparison_level_library.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def null_level(col_name) -> ComparisonLevel:
    """Build the comparison level that captures null values.

    The level matches when the left side, the right side, or both sides of the
    column are NULL, so the similarity cannot be evaluated.  It is flagged with
    ``is_null_level`` and is assumed to have a partial match weight of zero
    (null effect on overall match weight).

    Args:
        col_name (str): Input column name

    Returns:
        ComparisonLevel: Comparison level
    """
    dialect = _mutable_params["dialect"]
    input_col = InputColumn(col_name, sql_dialect=dialect)
    left, right = input_col.name_l(), input_col.name_r()

    return ComparisonLevel(
        {
            "sql_condition": f"{left} IS NULL OR {right} IS NULL",
            "label_for_charts": "Null",
            "is_null_level": True,
        },
        sql_dialect=dialect,
    )

exact_match_level(col_name, m_probability=None, term_frequency_adjustments=False)

Source code in splink/comparison_level_library.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def exact_match_level(
    col_name, m_probability=None, term_frequency_adjustments=False
) -> ComparisonLevel:
    """Build a comparison level that requires both sides of a column to be equal.

    Args:
        col_name (str): Input column name
        m_probability (float, optional): Starting value for m probability.  Only
            added to the level when truthy.  Defaults to None.
        term_frequency_adjustments (bool, optional): If truthy, ``col_name`` is
            recorded as the level's ``tf_adjustment_column``.  Defaults to
            False.

    Returns:
        ComparisonLevel: A comparison level that evaluates an exact match
    """
    dialect = _mutable_params["dialect"]
    input_col = InputColumn(col_name, sql_dialect=dialect)
    left, right = input_col.name_l(), input_col.name_r()

    settings = {
        "sql_condition": f"{left} = {right}",
        "label_for_charts": "Exact match",
    }

    # Optional keys are only present when the caller asked for them.
    if m_probability:
        settings["m_probability"] = m_probability
    if term_frequency_adjustments:
        settings["tf_adjustment_column"] = col_name

    return ComparisonLevel(settings, sql_dialect=dialect)

levenshtein_level(col_name, distance_threshold, m_probability=None)

Represents a comparison using a levenshtein distance function.

Parameters:

Name Type Description Default
col_name str

Input column name

required
distance_threshold Union[int, float]

The threshold to use to assess similarity

required
m_probability float

Starting value for m probability. Defaults to None.

None

Returns:

Name Type Description
ComparisonLevel ComparisonLevel

A comparison level that evaluates the levenshtein similarity

Source code in splink/comparison_level_library.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def levenshtein_level(
    col_name: str,
    distance_threshold: int,
    m_probability=None,
) -> ComparisonLevel:
    """Build a comparison level based on the levenshtein distance function.

    Delegates to ``distance_function_level`` with ``higher_is_more_similar``
    set to False, so the level matches when the distance is less than or equal
    to ``distance_threshold``.  The SQL function name is looked up from
    ``_mutable_params["levenshtein"]``.

    Args:
        col_name (str): Input column name
        distance_threshold (int): The threshold to use to assess similarity
        m_probability (float, optional): Starting value for m probability. Defaults to
            None.

    Returns:
        ComparisonLevel: A comparison level that evaluates the levenshtein similarity
    """
    return distance_function_level(
        col_name=col_name,
        distance_function_name=_mutable_params["levenshtein"],
        distance_threshold=distance_threshold,
        higher_is_more_similar=False,
        m_probability=m_probability,
    )

jaccard_level(col_name, distance_threshold, m_probability=None)

Represents a comparison using a jaccard distance function

Parameters:

Name Type Description Default
col_name str

Input column name

required
distance_threshold Union[int, float]

The threshold to use to assess similarity

required
m_probability float

Starting value for m probability. Defaults to None.

None

Returns:

Name Type Description
ComparisonLevel ComparisonLevel

A comparison level that evaluates the jaccard similarity

Source code in splink/comparison_level_library.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def jaccard_level(
    col_name: str,
    distance_threshold: Union[int, float],
    m_probability=None,
) -> ComparisonLevel:
    """Build a comparison level based on the jaccard distance function.

    Delegates to ``distance_function_level`` using the SQL function name
    ``jaccard`` with ``higher_is_more_similar`` set to True, so the level
    matches when the value is greater than or equal to ``distance_threshold``.

    Args:
        col_name (str): Input column name
        distance_threshold (Union[int, float]): The threshold to use to assess
            similarity
        m_probability (float, optional): Starting value for m probability. Defaults to
            None.

    Returns:
        ComparisonLevel: A comparison level that evaluates the jaccard similarity
    """
    return distance_function_level(
        col_name=col_name,
        distance_function_name="jaccard",
        distance_threshold=distance_threshold,
        higher_is_more_similar=True,
        m_probability=m_probability,
    )

else_level(m_probability=None)

Source code in splink/comparison_level_library.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def else_level(
    m_probability=None,
) -> ComparisonLevel:
    """Build the catch-all ``ELSE`` comparison level.

    Args:
        m_probability (float, optional): Starting value for m probability.  Only
            added to the level when truthy.  Defaults to None.

    Raises:
        ValueError: If ``m_probability`` is a string, which most likely means a
            column name was passed by mistake.

    Returns:
        ComparisonLevel: A comparison level labelled "All other comparisons"
    """
    # A string here is almost certainly a column name passed out of habit from
    # the other level builders, so fail loudly rather than storing it.
    if isinstance(m_probability, str):
        message = (
            "You provided a string for the value of m probability when it should be "
            "numeric.  Perhaps you passed a column name.  Note that you do not need to "
            "pass a column name into the else level."
        )
        raise ValueError(message)

    settings = {
        "sql_condition": "ELSE",
        "label_for_charts": "All other comparisons",
    }
    if m_probability:
        settings["m_probability"] = m_probability

    return ComparisonLevel(settings)

columns_reversed_level(col_name_1, col_name_2, m_probability=None, tf_adjustment_column=None)

Represents a comparison where the columns are reversed. For example, if surname is in the forename field and vice versa.

Parameters:

Name Type Description Default
col_name_1 str

First column, e.g. forename

required
col_name_2 str

Second column, e.g. surname

required
m_probability float

Starting value for m probability. Defaults to None.

None
tf_adjustment_column str

Column to use for term frequency adjustments if an exact match is observed. Defaults to None.

None

Returns:

Name Type Description
ComparisonLevel ComparisonLevel

A comparison level that evaluates the exact match of two columns.

Source code in splink/comparison_level_library.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def columns_reversed_level(
    col_name_1: str, col_name_2: str, m_probability=None, tf_adjustment_column=None
) -> ComparisonLevel:
    """Build a comparison level that detects two columns being swapped.

    The level matches when the left value of the first column equals the right
    value of the second column and vice versa, for example when surname is in
    the forename field and forename is in the surname field.

    Args:
        col_name_1 (str): First column, e.g. forename
        col_name_2 (str): Second column, e.g. surname
        m_probability (float, optional): Starting value for m probability.  Only
            added to the level when truthy.  Defaults to None.
        tf_adjustment_column (str, optional): Column to use for term frequency
            adjustments if an exact match is observed.  Only added to the level
            when truthy.  Defaults to None.

    Returns:
        ComparisonLevel: A comparison level that evaluates the exact match of two
            columns.
    """
    dialect = _mutable_params["dialect"]
    first = InputColumn(col_name_1, sql_dialect=dialect)
    second = InputColumn(col_name_2, sql_dialect=dialect)

    # Cross-compare: left of column 1 with right of column 2, and the mirror.
    forward = f"{first.name_l()} = {second.name_r()}"
    backward = f"{first.name_r()} = {second.name_l()}"

    settings = {
        "sql_condition": f"{forward} and {backward}",
        "label_for_charts": "Exact match on reversed cols",
    }

    optional_entries = (
        ("m_probability", m_probability),
        ("tf_adjustment_column", tf_adjustment_column),
    )
    for key, value in optional_entries:
        if value:
            settings[key] = value

    return ComparisonLevel(settings, sql_dialect=dialect)