Evaluation from ground truth column

Evaluation when you have fully labelled data

In this example, our data contains a fully populated ground-truth column called `cluster`, which enables us to analyse the accuracy of the final model.

from splink.datasets import splink_datasets
import altair as alt
# Select Altair's "html" renderer for the charts produced later on.
alt.renderers.enable("html")

# Load the `fake_1000` example dataset from splink's bundled datasets; its
# `cluster` column is the ground-truth label used for evaluation below.
df = splink_datasets.fake_1000

# Preview the first two rows.
df.head(2)

	unique_id	first_name	surname	dob	city	email	cluster
0	0	Robert	Alan	1971-06-24	NaN	robert255@smith.net	0
1	1	Robert	Allen	1971-05-24	NaN	roberta25@smith.net	0

from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl

# Blocking rules used when generating predictions: one rule on first name
# and one on surname.
prediction_blocking_rules = [
    block_on(column) for column in ("first_name", "surname")
]

# One comparison per input column that the model scores.
column_comparisons = [
    ctl.name_comparison("first_name"),
    ctl.name_comparison("surname"),
    ctl.date_comparison("dob", cast_strings_to_date=True),
    cl.exact_match("city", term_frequency_adjustments=True),
    ctl.email_comparison("email", include_username_fuzzy_level=False),
]

# Settings for a single-table deduplication model. The two `retain_*` flags
# are switched on; NOTE(review): presumably this is what makes the input and
# intermediate (gamma_/bf_) columns appear in the prediction output.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": column_comparisons,
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

# SQL conditions over a left (l) / right (r) record pair. They are passed to
# estimate_probability_two_random_records_match further down:
#   - same first name, dob within one edit
#   - same surname, dob within one edit
#   - same first name, surname within two edits
#   - identical email
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]

# DuckDB-backed linker over the input frame, with basic logging set-up disabled.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)

# Step 1: estimate the probability that two random records match from the
# deterministic rules, passing an assumed recall of 0.7 for those rules.
linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

# Step 2: estimate the u parameters by random sampling (seeded, capped at
# 1e6 pairs).
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)

# Step 3: run two expectation-maximisation sessions, each with a different
# blocking rule. NOTE(review): per the Splink docs, a session cannot estimate
# parameters for the column it blocks on, hence two complementary sessions.
dob_training_rule = block_on("dob")
session_dob = linker.estimate_parameters_using_expectation_maximisation(
    dob_training_rule
)

email_training_rule = block_on("email")
session_email = linker.estimate_parameters_using_expectation_maximisation(
    email_training_rule
)

# Build the truth-space table (confusion-matrix counts and derived metrics
# per match-weight threshold) against the `cluster` labels, with thresholds
# rounded to the nearest 0.1, then show its first five rows.
truth_space_table = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
truth_space_table.as_pandas_dataframe(limit=5)

	truth_threshold	match_probability	row_count	p	n	tp	tn	fp	fn	P_rate	...	precision	recall	specificity	npv	accuracy	f1	f2	f0_5	p4	phi
0	-24.3	4.841400e-08	4353.0	2031.0	2322.0	2031.0	0.0	2322.0	0.0	0.466575	...	0.466575	1.000000	0.000000	1.000000	0.466575	0.636278	0.813898	0.522296	0.000000	0.000000
1	-23.8	6.846774e-08	4353.0	2031.0	2322.0	2030.0	0.0	2322.0	1.0	0.466575	...	0.466452	0.999508	0.000000	0.000000	0.466345	0.636065	0.813562	0.522146	0.000000	-0.016208
2	-23.7	7.338190e-08	4353.0	2031.0	2322.0	2030.0	234.0	2088.0	1.0	0.466575	...	0.492958	0.999508	0.100775	0.995745	0.520101	0.660270	0.829113	0.548560	0.286607	0.221379
3	-22.6	1.572975e-07	4353.0	2031.0	2322.0	2030.0	428.0	1894.0	1.0	0.466575	...	0.517329	0.999508	0.184324	0.997669	0.564668	0.681780	0.842463	0.572573	0.427302	0.307690
4	-22.5	1.685873e-07	4353.0	2031.0	2322.0	2030.0	583.0	1739.0	1.0	0.466575	...	0.538604	0.999508	0.251077	0.998288	0.600276	0.700000	0.853443	0.593324	0.510093	0.366792

5 rows × 25 columns

# ROC chart computed against the `cluster` ground-truth column.
linker.roc_chart_from_labels_column("cluster")

# Precision-recall chart computed against the same ground-truth column.
linker.precision_recall_chart_from_labels_column("cluster")

# Tabulate the first five pairs where the model's prediction disagrees with
# the `cluster` labels; both false negatives and false positives are included.
labelled_errors = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
labelled_errors.as_pandas_dataframe(limit=5)

	clerical_match_score	found_by_blocking_rules	match_weight	match_probability	unique_id_l	unique_id_r	first_name_l	first_name_r	gamma_first_name	bf_first_name	...	tf_city_r	bf_city	bf_tf_adj_city	email_l	email_r	gamma_email	bf_email	cluster_l	cluster_r	match_key
0	1.0	False	-24.284246	4.894558e-08	417	418	Florence	Brown	0	0.216572	...	0.00123	0.429162	1.0	fb@reose.cem	f@b@reese.com	0	0.001067	108	108	2
1	1.0	False	-22.077164	2.260015e-07	796	797	Taylor	NaN	-1	1.000000	...	0.00738	0.429162	1.0	jt40o@combs.net	jt40@cotbs.nm	0	0.001067	201	201	2
2	1.0	False	-19.750689	1.133573e-06	452	454	NaN	Davies	-1	1.000000	...	0.01599	0.429162	1.0	rd@lewis.com	idlewrs.cocm	0	0.001067	115	115	2
3	1.0	True	-15.659150	1.932492e-05	594	595	Grace	Grace	4	85.509553	...	0.00123	0.429162	1.0	gk@frey-robinson.org	rgk@frey-robinon.org	0	0.001067	146	146	0
4	1.0	False	-14.411473	4.588751e-05	150	151	Alfie	Kelly	0	0.216572	...	0.04920	0.429162	1.0	alfiekelly@walters.com	NaN	-1	1.000000	40	40	2

5 rows × 32 columns

# Fetch the same kind of prediction errors (false negatives and false
# positives) as record dicts, limited to five rows.
prediction_errors = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
records = prediction_errors.as_record_dict(limit=5)

# Pass those records to the waterfall chart.
linker.waterfall_chart(records)