Skip to content

Estimating m probabilities from labels

Estimating m from a sample of pairwise labels¶

In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.

The table must be in the following format:

source_dataset_l unique_id_l source_dataset_r unique_id_r
df_1 1 df_2 2
df_1 1 df_2 3

It is assumed that every record in the table represents a certain match.

Note that the column names above are the defaults. They should correspond to the values you've set for unique_id_column_name and source_dataset_column_name, if you've chosen custom values.

import pandas as pd 
import altair as alt

from splink.datasets import splink_dataset_labels
# Load the example pairwise labels table exposed by splink.datasets.
pairwise_labels = splink_dataset_labels.fake_1000_labels

# Choose labels indicating a match: keep only the rows whose
# clerical_match_score is exactly 1, so every remaining pair is a certain match.
pairwise_labels = pairwise_labels[pairwise_labels["clerical_match_score"] == 1]
# Bare expression: in a notebook this displays the filtered table as the cell output.
pairwise_labels
unique_id_l source_dataset_l unique_id_r source_dataset_r clerical_match_score
0 0 fake_1000 1 fake_1000 1.0
1 0 fake_1000 2 fake_1000 1.0
2 0 fake_1000 3 fake_1000 1.0
49 1 fake_1000 2 fake_1000 1.0
50 1 fake_1000 3 fake_1000 1.0
... ... ... ... ... ...
3171 994 fake_1000 996 fake_1000 1.0
3172 995 fake_1000 996 fake_1000 1.0
3173 997 fake_1000 998 fake_1000 1.0
3174 997 fake_1000 999 fake_1000 1.0
3175 998 fake_1000 999 fake_1000 1.0

2031 rows × 5 columns

We now proceed to estimate the Fellegi Sunter model:

from splink.datasets import splink_datasets

# Input records to deduplicate: the fake_1000 example dataset from splink.datasets.
df = splink_datasets.fake_1000
# Preview the first two rows (displayed as the cell output in a notebook).
df.head(2)
unique_id first_name surname dob city email cluster
0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0
1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# Model settings passed to the linker below.
settings = {
    # Only one input table (df) is supplied to the linker.
    "link_type": "dedupe_only",
    # Blocking rules used when generating predictions: one rule on first_name
    # and one on surname.
    "blocking_rules_to_generate_predictions": [
        block_on("first_name"),
        block_on("surname"),
    ],
    # One comparison per input column; these are the comparisons whose
    # m and u probabilities are estimated later in the script.
    "comparisons": [
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        # dob is compared as a date, with string values cast to dates first.
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("city", term_frequency_adjustments=True),
        ctl.email_comparison("email", include_username_fuzzy_level=False),
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}
# Build a DuckDB-backed linker over df with the settings above; basic logging
# set-up is switched off.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# SQL conditions over a pair of records (aliases l and r) used as
# deterministic matching rules when estimating the prior match probability.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email"
]

# Estimate the probability that two random records match from the rules above.
# NOTE(review): recall=0.7 is presumably the assumed share of true matches that
# these rules capture - confirm against the Splink documentation.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
# Estimate the u probabilities by random sampling, using at most 1e6 pairs.
linker.estimate_u_using_random_sampling(max_pairs=1e6)
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
# Register the pairwise labels table with the database, and then use it to estimate the m values
# overwrite=True is passed so the registration can be re-run, e.g. when the cell is executed again.
labels_df = linker.register_labels_table(pairwise_labels, overwrite=True)
linker.estimate_m_from_pairwise_labels(labels_df)


# If the labels table already exists in the database, you could instead run
# linker.estimate_m_from_pairwise_labels("labels_tablename_here")
# Additionally run an expectation-maximisation training session, blocking on first_name.
training_blocking_rule = block_on("first_name")
linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
<EMTrainingSession, blocking on l."first_name" = r."first_name", deactivating comparisons first_name>
# Chart of the parameter estimates for each comparison.
linker.parameter_estimate_comparisons_chart()
# Chart of the model's match weights.
linker.match_weights_chart()