Skip to content

CookbookΒΆ

This notebook contains a miscellaneous collection of runnable examples illustrating various Splink techniques.

Array columnsΒΆ

Comparing array columnsΒΆ

This example shows how we can use use ArrayIntersectAtSizes to assess the similarity of columns containing arrays.

import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on


data = [
    {"unique_id": 1, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 2, "first_name": "John", "postcode": ["B"]},
    {"unique_id": 3, "first_name": "John", "postcode": ["A"]},
    {"unique_id": 4, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 5, "first_name": "John", "postcode": ["C"]},
]

df = pd.DataFrame(data)

settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
    ],
    comparisons=[
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ]
)


linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)

linker.inference.predict().as_pandas_dataframe()
match_weight match_probability unique_id_l unique_id_r postcode_l postcode_r gamma_postcode first_name_l first_name_r gamma_first_name
0 -8.287568 0.003190 4 5 [A, B] [C] 0 John John 1
1 -0.287568 0.450333 3 4 [A] [A, B] 1 John John 1
2 -8.287568 0.003190 3 5 [A] [C] 0 John John 1
3 -8.287568 0.003190 2 3 [B] [A] 0 John John 1
4 -0.287568 0.450333 2 4 [B] [A, B] 1 John John 1
5 -8.287568 0.003190 2 5 [B] [C] 0 John John 1
6 -0.287568 0.450333 1 2 [A, B] [B] 1 John John 1
7 -0.287568 0.450333 1 3 [A, B] [A] 1 John John 1
8 6.712432 0.990554 1 4 [A, B] [A, B] 2 John John 1
9 -8.287568 0.003190 1 5 [A, B] [C] 0 John John 1

Blocking on array columnsΒΆ

This example shows how we can use block_on to block on the individual elements of an array column - that is, pairwise comaprisons are created for pairs or records where any of the elements in the array columns match.

import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on


data = [
    {"unique_id": 1, "first_name": "John", "postcode": ["A", "B"]},
    {"unique_id": 2, "first_name": "John", "postcode": ["B"]},
    {"unique_id": 3, "first_name": "John", "postcode": ["C"]},

]

df = pd.DataFrame(data)

settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on("postcode", arrays_to_explode=["postcode"]),
    ],
    comparisons=[
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ]
)


linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)

linker.inference.predict().as_pandas_dataframe()
match_weight match_probability unique_id_l unique_id_r postcode_l postcode_r gamma_postcode first_name_l first_name_r gamma_first_name
0 -0.287568 0.450333 1 2 [A, B] [B] 1 John John 1

OtherΒΆ

Using DuckDB without pandasΒΆ

In this example, we read data directly using DuckDB and obtain results in native DuckDB DuckDBPyRelation format.

import duckdb
import tempfile
import os

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

# Create a parquet file on disk to demontrate native DuckDB parquet reading
df = splink_datasets.fake_1000
temp_file = tempfile.NamedTemporaryFile(delete=True, suffix=".parquet")
temp_file_path = temp_file.name
df.to_parquet(temp_file_path)

# Example would start here if you already had a parquet file
duckdb_df = duckdb.read_parquet(temp_file_path)

db_api = DuckDBAPI(":default:")
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.NameComparison("first_name"),
        cl.JaroAtThresholds("surname"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "dob"),
        block_on("surname"),
    ],
)

linker = Linker(df, settings, db_api, set_up_basic_logging=False)

result = linker.inference.predict().as_duckdbpyrelation()

# Since result is a DuckDBPyRelation, we can use all the usual DuckDB API
# functions on it.

# For example, we can use the `sort` function to sort the results,
# or could use result.to_parquet() to write to a parquet file.
result.sort("match_weight")
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚    match_weight     β”‚  match_probability   β”‚ unique_id_l β”‚ … β”‚ gamma_surname β”‚   dob_l    β”‚   dob_r    β”‚ match_key β”‚
β”‚       double        β”‚        double        β”‚    int64    β”‚   β”‚     int32     β”‚  varchar   β”‚  varchar   β”‚  varchar  β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚  -11.83278901894715 β”‚ 0.000274066864295451 β”‚         758 β”‚ … β”‚             0 β”‚ 2002-09-15 β”‚ 2002-09-15 β”‚ 0         β”‚
β”‚ -10.247826518225994 β”‚  0.0008217501639050… β”‚         670 β”‚ … β”‚             0 β”‚ 2006-12-05 β”‚ 2006-12-05 β”‚ 0         β”‚
β”‚  -9.662864017504837 β”‚  0.0012321189988629… β”‚         558 β”‚ … β”‚             0 β”‚ 2020-02-11 β”‚ 2020-02-11 β”‚ 0         β”‚
β”‚  -9.470218939562441 β”‚  0.0014078881864458… β”‚         259 β”‚ … β”‚             1 β”‚ 1983-03-07 β”‚ 1983-03-07 β”‚ 0         β”‚
β”‚  -8.470218939562441 β”‚ 0.002811817648042493 β”‚         644 β”‚ … β”‚            -1 β”‚ 1992-02-06 β”‚ 1992-02-06 β”‚ 0         β”‚
β”‚  -8.287568102831404 β”‚  0.0031901106569634… β”‚         393 β”‚ … β”‚             3 β”‚ 1991-05-06 β”‚ 1991-04-12 β”‚ 1         β”‚
β”‚  -8.287568102831404 β”‚  0.0031901106569634… β”‚         282 β”‚ … β”‚             3 β”‚ 2004-12-02 β”‚ 2002-02-25 β”‚ 1         β”‚
β”‚  -8.287568102831404 β”‚  0.0031901106569634… β”‚         282 β”‚ … β”‚             3 β”‚ 2004-12-02 β”‚ 1993-03-01 β”‚ 1         β”‚
β”‚  -8.287568102831404 β”‚  0.0031901106569634… β”‚         531 β”‚ … β”‚             3 β”‚ 1987-09-11 β”‚ 2000-09-03 β”‚ 1         β”‚
β”‚  -8.287568102831404 β”‚  0.0031901106569634… β”‚         531 β”‚ … β”‚             3 β”‚ 1987-09-11 β”‚ 1990-10-06 β”‚ 1         β”‚
β”‚           Β·         β”‚            Β·         β”‚          Β·  β”‚ Β· β”‚             Β· β”‚     Β·      β”‚     Β·      β”‚ Β·         β”‚
β”‚           Β·         β”‚            Β·         β”‚          Β·  β”‚ Β· β”‚             Β· β”‚     Β·      β”‚     Β·      β”‚ Β·         β”‚
β”‚           Β·         β”‚            Β·         β”‚          Β·  β”‚ Β· β”‚             Β· β”‚     Β·      β”‚     Β·      β”‚ Β·         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         554 β”‚ … β”‚             3 β”‚ 2020-02-11 β”‚ 2030-02-08 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         774 β”‚ … β”‚             3 β”‚ 2027-04-21 β”‚ 2017-04-23 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         874 β”‚ … β”‚             3 β”‚ 2020-06-23 β”‚ 2019-05-23 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         409 β”‚ … β”‚             3 β”‚ 2017-05-03 β”‚ 2008-05-05 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         415 β”‚ … β”‚             3 β”‚ 2002-02-25 β”‚ 1993-03-01 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         740 β”‚ … β”‚             3 β”‚ 2005-09-18 β”‚ 2006-09-14 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         417 β”‚ … β”‚             3 β”‚ 2002-02-24 β”‚ 1992-02-28 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         534 β”‚ … β”‚             3 β”‚ 1974-02-28 β”‚ 1975-03-31 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         286 β”‚ … β”‚             3 β”‚ 1985-01-05 β”‚ 1986-02-04 β”‚ 1         β”‚
β”‚   5.337135982495163 β”‚   0.9758593366351407 β”‚         172 β”‚ … β”‚             3 β”‚ 2012-07-06 β”‚ 2012-07-09 β”‚ 1         β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ 1800 rows (20 shown)                                                                          13 columns (7 shown) β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜

Fixing m or u probabilities during trainingΒΆ

import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets


db_api = DuckDBAPI()

first_name_comparison = cl.CustomComparison(
    comparison_levels=[
        cll.NullLevel("first_name"),
        cll.ExactMatchLevel("first_name").configure(
            m_probability=0.9999,
            fix_m_probability=True,
            u_probability=0.7,
            fix_u_probability=True,
        ),
        cll.ElseLevel(),
    ]
)
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        first_name_comparison,
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("dob"),
    ],
    additional_columns_to_retain=["cluster"],
)

df = splink_datasets.fake_1000
linker = Linker(df, settings, db_api, set_up_basic_logging=False)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

linker.visualisations.m_u_parameters_chart()

Manually altering m and u probabilities post-trainingΒΆ

This is not officially supported, but can be useful for ad-hoc alterations to trained models.

import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink.datasets import splink_dataset_labels

labels = splink_dataset_labels.fake_1000_labels

db_api = DuckDBAPI()


settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("dob"),
    ],
)
df = splink_datasets.fake_1000
linker = Linker(df, settings, db_api, set_up_basic_logging=False)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))


surname_comparison = linker._settings_obj._get_comparison_by_output_column_name(
    "surname"
)
else_comparison_level = (
    surname_comparison._get_comparison_level_by_comparison_vector_value(0)
)
else_comparison_level._m_probability = 0.1


linker.visualisations.m_u_parameters_chart()

Generate the (beta) labelling toolΒΆ

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

db_api = DuckDBAPI()

df = splink_datasets.fake_1000

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        cl.ExactMatch("email"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("surname"),
    ],
    max_iterations=2,
)

linker = Linker(df, settings, db_api, set_up_basic_logging=False)

linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "surname")], recall=0.7
)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)

first_unique_id = df.iloc[0].unique_id
linker.evaluation.labelling_tool_for_specific_record(unique_id=first_unique_id, overwrite=True)