Cookbook
This notebook contains a miscellaneous collection of runnable examples illustrating various Splink techniques.
Array columns
Comparing array columns
This example shows how we can use ArrayIntersectAtSizes
to assess the similarity of columns containing arrays.
import pandas as pd
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
# Five records sharing a first name, with postcode arrays of varying overlap.
records = [
    {"unique_id": 1, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 2, "first_name": "John", "postcode": ["B"]},
    {"unique_id": 3, "first_name": "John", "postcode": ["A"]},
    {"unique_id": 4, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 5, "first_name": "John", "postcode": ["C"]},
]
df = pd.DataFrame(records)

# ArrayIntersectAtSizes("postcode", [2, 1]) defines comparison levels by the
# number of shared array elements: one level for an intersection of size >= 2,
# one for size >= 1, plus the implicit "else" level for no overlap.
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
    ],
    comparisons=[
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ],
)

linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)
linker.inference.predict().as_pandas_dataframe()
match_weight | match_probability | unique_id_l | unique_id_r | postcode_l | postcode_r | gamma_postcode | first_name_l | first_name_r | gamma_first_name | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -8.287568 | 0.003190 | 4 | 5 | [A, B] | [C] | 0 | John | John | 1 |
1 | -0.287568 | 0.450333 | 3 | 4 | [A] | [A, B] | 1 | John | John | 1 |
2 | -8.287568 | 0.003190 | 3 | 5 | [A] | [C] | 0 | John | John | 1 |
3 | -8.287568 | 0.003190 | 2 | 3 | [B] | [A] | 0 | John | John | 1 |
4 | -0.287568 | 0.450333 | 2 | 4 | [B] | [A, B] | 1 | John | John | 1 |
5 | -8.287568 | 0.003190 | 2 | 5 | [B] | [C] | 0 | John | John | 1 |
6 | -0.287568 | 0.450333 | 1 | 2 | [A, B] | [B] | 1 | John | John | 1 |
7 | -0.287568 | 0.450333 | 1 | 3 | [A, B] | [A] | 1 | John | John | 1 |
8 | 6.712432 | 0.990554 | 1 | 4 | [A, B] | [A, B] | 2 | John | John | 1 |
9 | -8.287568 | 0.003190 | 1 | 5 | [A, B] | [C] | 0 | John | John | 1 |
Blocking on array columns
This example shows how we can use block_on
to block on the individual elements of an array column - that is, pairwise comparisons are created for pairs of records where any of the elements in the array columns match.
import pandas as pd
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
# Three records: unique_id 1 shares postcode element "B" with unique_id 2 only.
records = [
    {"unique_id": 1, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 2, "first_name": "John", "postcode": ["B"]},
    {"unique_id": 3, "first_name": "John", "postcode": ["C"]},
]
df = pd.DataFrame(records)

# arrays_to_explode unnests the postcode array before the blocking rule is
# applied, so a candidate pair is generated whenever any one element matches.
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on("postcode", arrays_to_explode=["postcode"]),
    ],
    comparisons=[
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ],
)

linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)
linker.inference.predict().as_pandas_dataframe()
match_weight | match_probability | unique_id_l | unique_id_r | postcode_l | postcode_r | gamma_postcode | first_name_l | first_name_r | gamma_first_name | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -0.287568 | 0.450333 | 1 | 2 | [A, B] | [B] | 1 | John | John | 1 |
Other
Using DuckDB without pandas
In this example, we read data directly using DuckDB and obtain results in native DuckDB DuckDBPyRelation
format.
import duckdb
import tempfile
import os
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
# Create a parquet file on disk to demonstrate native DuckDB parquet reading.
df = splink_datasets.fake_1000
temp_file = tempfile.NamedTemporaryFile(delete=True, suffix=".parquet")
temp_file_path = temp_file.name
df.to_parquet(temp_file_path)

# Example would start here if you already had a parquet file.
# Read the parquet file natively with DuckDB - no pandas round-trip.
duckdb_df = duckdb.read_parquet(temp_file_path)

# ":default:" makes Splink use DuckDB's default (module-level) connection,
# the same one duckdb.read_parquet registered the relation against.
db_api = DuckDBAPI(":default:")

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.NameComparison("first_name"),
        cl.JaroAtThresholds("surname"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "dob"),
        block_on("surname"),
    ],
)

# Bug fix: pass the native DuckDB relation (duckdb_df) to the Linker rather
# than the pandas DataFrame - otherwise the DuckDB read above is unused and
# the example does not actually demonstrate pandas-free input.
linker = Linker(duckdb_df, settings, db_api, set_up_basic_logging=False)

result = linker.inference.predict().as_duckdbpyrelation()

# Since result is a DuckDBPyRelation, we can use all the usual DuckDB API
# functions on it.
# For example, we can use the `sort` function to sort the results,
# or could use result.to_parquet() to write to a parquet file.
result.sort("match_weight")
βββββββββββββββββββββββ¬βββββββββββββββββββββββ¬ββββββββββββββ¬ββββ¬ββββββββββββββββ¬βββββββββββββ¬βββββββββββββ¬ββββββββββββ
β match_weight β match_probability β unique_id_l β β¦ β gamma_surname β dob_l β dob_r β match_key β
β double β double β int64 β β int32 β varchar β varchar β varchar β
βββββββββββββββββββββββΌβββββββββββββββββββββββΌββββββββββββββΌββββΌββββββββββββββββΌβββββββββββββΌβββββββββββββΌββββββββββββ€
β -11.83278901894715 β 0.000274066864295451 β 758 β β¦ β 0 β 2002-09-15 β 2002-09-15 β 0 β
β -10.247826518225994 β 0.0008217501639050β¦ β 670 β β¦ β 0 β 2006-12-05 β 2006-12-05 β 0 β
β -9.662864017504837 β 0.0012321189988629β¦ β 558 β β¦ β 0 β 2020-02-11 β 2020-02-11 β 0 β
β -9.470218939562441 β 0.0014078881864458β¦ β 259 β β¦ β 1 β 1983-03-07 β 1983-03-07 β 0 β
β -8.470218939562441 β 0.002811817648042493 β 644 β β¦ β -1 β 1992-02-06 β 1992-02-06 β 0 β
β -8.287568102831404 β 0.0031901106569634β¦ β 393 β β¦ β 3 β 1991-05-06 β 1991-04-12 β 1 β
β -8.287568102831404 β 0.0031901106569634β¦ β 282 β β¦ β 3 β 2004-12-02 β 2002-02-25 β 1 β
β -8.287568102831404 β 0.0031901106569634β¦ β 282 β β¦ β 3 β 2004-12-02 β 1993-03-01 β 1 β
β -8.287568102831404 β 0.0031901106569634β¦ β 531 β β¦ β 3 β 1987-09-11 β 2000-09-03 β 1 β
β -8.287568102831404 β 0.0031901106569634β¦ β 531 β β¦ β 3 β 1987-09-11 β 1990-10-06 β 1 β
β Β· β Β· β Β· β Β· β Β· β Β· β Β· β Β· β
β Β· β Β· β Β· β Β· β Β· β Β· β Β· β Β· β
β Β· β Β· β Β· β Β· β Β· β Β· β Β· β Β· β
β 5.337135982495163 β 0.9758593366351407 β 554 β β¦ β 3 β 2020-02-11 β 2030-02-08 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 774 β β¦ β 3 β 2027-04-21 β 2017-04-23 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 874 β β¦ β 3 β 2020-06-23 β 2019-05-23 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 409 β β¦ β 3 β 2017-05-03 β 2008-05-05 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 415 β β¦ β 3 β 2002-02-25 β 1993-03-01 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 740 β β¦ β 3 β 2005-09-18 β 2006-09-14 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 417 β β¦ β 3 β 2002-02-24 β 1992-02-28 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 534 β β¦ β 3 β 1974-02-28 β 1975-03-31 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 286 β β¦ β 3 β 1985-01-05 β 1986-02-04 β 1 β
β 5.337135982495163 β 0.9758593366351407 β 172 β β¦ β 3 β 2012-07-06 β 2012-07-09 β 1 β
βββββββββββββββββββββββ΄βββββββββββββββββββββββ΄ββββββββββββββ΄ββββ΄ββββββββββββββββ΄βββββββββββββ΄βββββββββββββ΄ββββββββββββ€
β 1800 rows (20 shown) 13 columns (7 shown) β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Fixing m or u probabilities during training
import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
db_api = DuckDBAPI()

# An exact-match level with both m and u pinned: fix_m_probability /
# fix_u_probability tell EM training to leave these values untouched.
fixed_exact_match_level = cll.ExactMatchLevel("first_name").configure(
    m_probability=0.9999,
    fix_m_probability=True,
    u_probability=0.7,
    fix_u_probability=True,
)

first_name_comparison = cl.CustomComparison(
    comparison_levels=[
        cll.NullLevel("first_name"),
        fixed_exact_match_level,
        cll.ElseLevel(),
    ]
)

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        first_name_comparison,
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("dob"),
    ],
    additional_columns_to_retain=["cluster"],
)

df = splink_datasets.fake_1000
linker = Linker(df, settings, db_api, set_up_basic_logging=False)

# Train u then m; the fixed first_name parameters survive both steps.
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

linker.visualisations.m_u_parameters_chart()
Manually altering m and u probabilities post-training
This is not officially supported, but can be useful for ad-hoc alterations to trained models.
import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink.datasets import splink_dataset_labels
# (Removed an unused `labels = splink_dataset_labels.fake_1000_labels`
# assignment - it was dead code in this example.)
db_api = DuckDBAPI()

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("dob"),
    ],
)

df = splink_datasets.fake_1000
linker = Linker(df, settings, db_api, set_up_basic_logging=False)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

# Reach into the private settings object to override a trained value.
# This is not officially supported API - ad-hoc use only.
surname_comparison = linker._settings_obj._get_comparison_by_output_column_name(
    "surname"
)
# Comparison vector value 0 is the lowest ("else"/non-match) level.
else_comparison_level = (
    surname_comparison._get_comparison_level_by_comparison_vector_value(0)
)
# Manually force the m probability of that level.
else_comparison_level._m_probability = 0.1

linker.visualisations.m_u_parameters_chart()
Generate the (beta) labelling tool
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
db_api = DuckDBAPI()
df = splink_datasets.fake_1000

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        # term_frequency_adjustments weights city matches by value frequency.
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        cl.ExactMatch("email"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("surname"),
    ],
    max_iterations=2,
)

linker = Linker(df, settings, db_api, set_up_basic_logging=False)

# Train: prior probability, then u via random sampling, then m via EM.
linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "surname")], recall=0.7
)
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)

# Generate the (beta) labelling tool for the first record in the data.
first_unique_id = df.iloc[0].unique_id
linker.evaluation.labelling_tool_for_specific_record(
    unique_id=first_unique_id, overwrite=True
)
Modifying settings after loading from a serialised .json model
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
# Set-up: build a simple model so there is something to serialise.
db_api = DuckDBAPI()
df = splink_datasets.fake_1000

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.LevenshteinAtThresholds("first_name"),
        cl.LevenshteinAtThresholds("surname"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "dob"),
        block_on("surname"),
    ],
)

linker = Linker(df, settings, db_api)
linker.misc.save_model_to_json("mod.json", overwrite=True)

# Reload the serialised model and tweak its settings before constructing
# a fresh linker from it.
new_settings = SettingsCreator.from_path_or_dict("mod.json")
new_settings.retain_intermediate_calculation_columns = True
new_settings.blocking_rules_to_generate_predictions = ["1=1"]
new_settings.additional_columns_to_retain = ["cluster"]

linker = Linker(df, new_settings, DuckDBAPI())
linker.inference.predict().as_duckdbpyrelation().show()
Using a DuckDB UDF in a comparison level
import difflib
import duckdb
import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
def custom_partial_ratio(s1, s2):
    """Return a similarity score in [0, 1] for two values.

    Both inputs are coerced to ``str`` (so SQL NULLs arrive as the string
    ``"None"``) before comparison.

    NOTE(review): despite the name, this computes difflib's *full*
    ``SequenceMatcher.ratio()`` over both whole strings - it is not a
    fuzzywuzzy-style "partial ratio" (best-matching substring). The name is
    kept unchanged because the registered DuckDB UDF and the SQL comparison
    level below reference it.
    """
    s1, s2 = str(s1), str(s2)
    matcher = difflib.SequenceMatcher(None, s1, s2)
    return matcher.ratio()
df = splink_datasets.fake_1000

# Register the Python function as a DuckDB UDF on an explicit connection,
# then hand that same connection to Splink via DuckDBAPI so comparison SQL
# can call it.
con = duckdb.connect()
con.create_function(
    "custom_partial_ratio",
    custom_partial_ratio,
    [duckdb.typing.VARCHAR, duckdb.typing.VARCHAR],
    duckdb.typing.DOUBLE,
)
db_api = DuckDBAPI(connection=con)

# A comparison defined as a raw dict: the middle level invokes the UDF
# registered above directly from SQL.
fuzzy_email_comparison = {
    "output_column_name": "email_fuzzy",
    "comparison_levels": [
        cll.NullLevel("email"),
        cll.ExactMatchLevel("email"),
        {
            "sql_condition": "custom_partial_ratio(email_l, email_r) > 0.8",
            # Fixed a mojibake-garbled ">=" symbol in the chart label.
            "label_for_charts": "Fuzzy match (≥ 0.8)",
        },
        cll.ElseLevel(),
    ],
}

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        fuzzy_email_comparison,
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("surname"),
    ],
    max_iterations=2,
)

linker = Linker(df, settings, db_api)

# Train: prior probability, then u via random sampling, then m via EM.
linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "surname")], recall=0.7
)
linker.training.estimate_u_using_random_sampling(max_pairs=1e5)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)