Skip to content

Evaluation from ground truth column

Evaluation when you have fully labelled data

In this example, our data contains a fully-populated ground-truth column called `cluster`, which enables us to perform accuracy analysis of the final model.

from splink.datasets import splink_datasets
import altair as alt
# Select Altair's "html" renderer for chart output.
alt.renderers.enable("html")

# Load the fake_1000 example dataset from splink_datasets.
df = splink_datasets.fake_1000

# Preview the first two rows.
df.head(2)
unique_id first_name surname dob city email cluster
0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0
1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl

# Blocking rules used to generate the candidate pairs that get scored:
# one rule on first_name and one on surname.
prediction_blocking_rules = [
    block_on("first_name"),
    block_on("surname"),
]

# One comparison per column that the model scores.
column_comparisons = [
    ctl.name_comparison("first_name"),
    ctl.name_comparison("surname"),
    ctl.date_comparison("dob", cast_strings_to_date=True),
    cl.exact_match("city", term_frequency_adjustments=True),
    ctl.email_comparison("email", include_username_fuzzy_level=False),
]

# Model settings: a single input table is deduplicated, and both the matching
# columns and the intermediate calculation columns are kept in the output.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": column_comparisons,
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# SQL conditions over a left (l) and right (r) record, passed below to
# estimate the probability that two random records match.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email"
]

# Training steps; each call updates the linker's model in place, so order matters.
# recall=0.7 is the recall assumed for the deterministic rules above.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
# u estimation from randomly sampled pairs (at most 1e6), with a fixed seed.
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)
# Two expectation-maximisation sessions: one blocking on dob, one on email.
session_dob = linker.estimate_parameters_using_expectation_maximisation(block_on("dob"))
session_email = linker.estimate_parameters_using_expectation_maximisation(block_on("email"))
# Build the truth space table using the `cluster` column as the labels, then
# display its first five rows as a pandas DataFrame.
truth_space = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
truth_space.as_pandas_dataframe(limit=5)
truth_threshold match_probability row_count p n tp tn fp fn P_rate ... precision recall specificity npv accuracy f1 f2 f0_5 p4 phi
0 -24.3 4.841400e-08 4353.0 2031.0 2322.0 2031.0 0.0 2322.0 0.0 0.466575 ... 0.466575 1.000000 0.000000 1.000000 0.466575 0.636278 0.813898 0.522296 0.000000 0.000000
1 -23.8 6.846774e-08 4353.0 2031.0 2322.0 2030.0 0.0 2322.0 1.0 0.466575 ... 0.466452 0.999508 0.000000 0.000000 0.466345 0.636065 0.813562 0.522146 0.000000 -0.016208
2 -23.7 7.338190e-08 4353.0 2031.0 2322.0 2030.0 234.0 2088.0 1.0 0.466575 ... 0.492958 0.999508 0.100775 0.995745 0.520101 0.660270 0.829113 0.548560 0.286607 0.221379
3 -22.6 1.572975e-07 4353.0 2031.0 2322.0 2030.0 428.0 1894.0 1.0 0.466575 ... 0.517329 0.999508 0.184324 0.997669 0.564668 0.681780 0.842463 0.572573 0.427302 0.307690
4 -22.5 1.685873e-07 4353.0 2031.0 2322.0 2030.0 583.0 1739.0 1.0 0.466575 ... 0.538604 0.999508 0.251077 0.998288 0.600276 0.700000 0.853443 0.593324 0.510093 0.366792

5 rows × 25 columns

# ROC chart, using the `cluster` column as the labels.
linker.roc_chart_from_labels_column("cluster")
# Precision-recall chart from the same labels column.
linker.precision_recall_chart_from_labels_column("cluster")
# Tabulate a sample of prediction errors, requesting both false negatives and
# false positives, judged against the `cluster` labels column (first five rows).
linker.prediction_errors_from_labels_column(
    "cluster", include_false_negatives=True, include_false_positives=True
).as_pandas_dataframe(limit=5)
clerical_match_score found_by_blocking_rules match_weight match_probability unique_id_l unique_id_r first_name_l first_name_r gamma_first_name bf_first_name ... tf_city_r bf_city bf_tf_adj_city email_l email_r gamma_email bf_email cluster_l cluster_r match_key
0 1.0 False -24.284246 4.894558e-08 417 418 Florence Brown 0 0.216572 ... 0.00123 0.429162 1.0 fb@reose.cem f@b@reese.com 0 0.001067 108 108 2
1 1.0 False -22.077164 2.260015e-07 796 797 Taylor NaN -1 1.000000 ... 0.00738 0.429162 1.0 jt40o@combs.net jt40@cotbs.nm 0 0.001067 201 201 2
2 1.0 False -19.750689 1.133573e-06 452 454 NaN Davies -1 1.000000 ... 0.01599 0.429162 1.0 rd@lewis.com idlewrs.cocm 0 0.001067 115 115 2
3 1.0 True -15.659150 1.932492e-05 594 595 Grace Grace 4 85.509553 ... 0.00123 0.429162 1.0 gk@frey-robinson.org rgk@frey-robinon.org 0 0.001067 146 146 0
4 1.0 False -14.411473 4.588751e-05 150 151 Alfie Kelly 0 0.216572 ... 0.04920 0.429162 1.0 alfiekelly@walters.com NaN -1 1.000000 40 40 2

5 rows × 32 columns

# Fetch up to five mis-predicted pairs (false positives and false negatives,
# judged against the `cluster` labels column) as record dictionaries.
prediction_errors = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_positives=True,
    include_false_negatives=True,
)
records = prediction_errors.as_record_dict(limit=5)

# Draw the waterfall chart for those records.
linker.waterfall_chart(records)