Estimating m probabilities from labels

Estimating m from a sample of pairwise labels

In this example, we estimate the m probabilities of the model from a table containing pairwise record comparisons which we know are 'true' matches. For example, these may be the result of work by a clerical team who have manually labelled a sample of matches.

The table must be in the following format:

source_dataset_l	unique_id_l	source_dataset_r	unique_id_r
df_1	1	df_2	2
df_1	1	df_2	3

It is assumed that every row in the table represents a pair of records that is a certain match.

Note that the column names above are the defaults. They should correspond to the values you've set for unique_id_column_name and source_dataset_column_name, if you've chosen custom values.

from splink.datasets import splink_dataset_labels

# Load the full table of clerically scored pairs from splink's example datasets
all_labels = splink_dataset_labels.fake_1000_labels

# Keep only the pairs whose clerical_match_score equals 1 (labelled as matches)
is_match = all_labels["clerical_match_score"] == 1
pairwise_labels = all_labels[is_match]
pairwise_labels

	unique_id_l	source_dataset_l	unique_id_r	source_dataset_r	clerical_match_score
0	0	fake_1000	1	fake_1000	1.0
1	0	fake_1000	2	fake_1000	1.0
2	0	fake_1000	3	fake_1000	1.0
49	1	fake_1000	2	fake_1000	1.0
50	1	fake_1000	3	fake_1000	1.0
...	...	...	...	...	...
3171	994	fake_1000	996	fake_1000	1.0
3172	995	fake_1000	996	fake_1000	1.0
3173	997	fake_1000	998	fake_1000	1.0
3174	997	fake_1000	999	fake_1000	1.0
3175	998	fake_1000	999	fake_1000	1.0

2031 rows × 5 columns

We now proceed to estimate the Fellegi-Sunter model:

from splink import splink_datasets

# Load the fake_1000 example dataset; this is the input table passed to the Linker
df = splink_datasets.fake_1000
# Preview the first two rows
df.head(2)

	unique_id	first_name	surname	dob	city	email	cluster
0	0	Robert	Alan	1971-06-24	NaN	robert255@smith.net	0
1	1	Robert	Allen	1971-05-24	NaN	roberta25@smith.net	0

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

# Blocking rules passed as blocking_rules_to_generate_predictions
prediction_blocking_rules = [
    block_on("first_name"),
    block_on("surname"),
]

# One comparison per input column; dob is declared as a string column and
# city has term frequency adjustments switched on
column_comparisons = [
    cl.NameComparison("first_name"),
    cl.NameComparison("surname"),
    cl.DateOfBirthComparison("dob", input_is_string=True),
    cl.ExactMatch("city").configure(term_frequency_adjustments=True),
    cl.EmailComparison("email"),
]

# Model settings for deduplicating a single input table
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=column_comparisons,
    retain_intermediate_calculation_columns=True,
)

# Build the linker over df with the settings above, using a DuckDB backend
linker = Linker(df, settings, db_api=DuckDBAPI(), set_up_basic_logging=False)

# SQL conditions over the left (l) and right (r) record of a pair; they are
# passed to estimate_probability_two_random_records_match below
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]

# NOTE(review): recall=0.7 presumably states the assumed share of true matches
# that the deterministic rules capture -- confirm against the splink docs
linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

# Estimate the u probabilities by random sampling. Splink's warning for this
# call reports 1e6 as the default max_pairs and suggests 1e8 or 1e9 for more
# accurate estimates at the cost of a longer run time.
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.

# Register the pairwise labels table with the database, and then use it to estimate the m values.
# overwrite=True is passed so the registration can be re-run (e.g. when re-executing this cell).
labels_df = linker.table_management.register_labels_table(pairwise_labels, overwrite=True)
linker.training.estimate_m_from_pairwise_labels(labels_df)


# If the labels table already exists in the database, you could instead run
# linker.training.estimate_m_from_pairwise_labels("labels_tablename_here")

# Run an expectation-maximisation training session blocked on first_name.
# The returned session's repr shows that the first_name comparison is
# deactivated for this session.
training_blocking_rule = block_on("first_name")
linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)

<EMTrainingSession, blocking on l."first_name" = r."first_name", deactivating comparisons first_name>

# Display the parameter-estimate comparison chart for the trained model
linker.visualisations.parameter_estimate_comparisons_chart()

# Display the match weights chart for the trained model
linker.visualisations.match_weights_chart()