Evaluation from ground truth column
Evaluation when you have fully labelled data
In this example, our data contains a fully populated ground-truth column called `cluster`,
which enables us to perform accuracy analysis of the final model.
from splink.datasets import splink_datasets
import altair as alt
# Select Altair's "html" renderer for the charts produced further down.
alt.renderers.enable("html")
# Load the fake_1000 example dataset exposed by splink_datasets.
df = splink_datasets.fake_1000
# Preview the first two rows.
df.head(2)
unique_id | first_name | surname | dob | city | email | cluster | |
---|---|---|---|---|---|---|---|
0 | 0 | Robert | Alan | 1971-06-24 | NaN | robert255@smith.net | 0 |
1 | 1 | Robert | Allen | 1971-05-24 | NaN | roberta25@smith.net | 0 |
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl
# Blocking rules used when generating predictions: one on first name and
# one on surname.
prediction_blocking_rules = [
    block_on("first_name"),
    block_on("surname"),
]

# One comparison per column that the model scores.
column_comparisons = [
    ctl.name_comparison("first_name"),
    ctl.name_comparison("surname"),
    ctl.date_comparison("dob", cast_strings_to_date=True),
    cl.exact_match("city", term_frequency_adjustments=True),
    ctl.email_comparison("email", include_username_fuzzy_level=False),
]

# Model settings; both "retain_*" flags are switched on so the matching
# columns and intermediate calculation columns are kept in the output.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=column_comparisons,
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
)

# DuckDB-backed linker over the example data, with basic logging disabled.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# SQL conditions over record pairs (aliases l and r) that are passed to the
# prior estimate below.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]

# Step 1: estimate the probability that two random records match, using the
# deterministic rules together with recall=0.7.
linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

# Step 2: estimate the u parameters by random sampling, with a fixed seed and
# a cap on the number of sampled pairs.
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)

# Step 3: two expectation-maximisation sessions, the first blocking on dob
# and the second on email.
dob_training_rule = block_on("dob")
session_dob = linker.estimate_parameters_using_expectation_maximisation(
    dob_training_rule
)
email_training_rule = block_on("email")
session_email = linker.estimate_parameters_using_expectation_maximisation(
    email_training_rule
)
# Build the truth-space table from the "cluster" label column, rounding match
# weights to the nearest 0.1, then show the first five rows as a pandas
# DataFrame.
truth_space = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
truth_space.as_pandas_dataframe(limit=5)
truth_threshold | match_probability | row_count | p | n | tp | tn | fp | fn | P_rate | ... | precision | recall | specificity | npv | accuracy | f1 | f2 | f0_5 | p4 | phi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -24.3 | 4.841400e-08 | 4353.0 | 2031.0 | 2322.0 | 2031.0 | 0.0 | 2322.0 | 0.0 | 0.466575 | ... | 0.466575 | 1.000000 | 0.000000 | 1.000000 | 0.466575 | 0.636278 | 0.813898 | 0.522296 | 0.000000 | 0.000000 |
1 | -23.8 | 6.846774e-08 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 0.0 | 2322.0 | 1.0 | 0.466575 | ... | 0.466452 | 0.999508 | 0.000000 | 0.000000 | 0.466345 | 0.636065 | 0.813562 | 0.522146 | 0.000000 | -0.016208 |
2 | -23.7 | 7.338190e-08 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 234.0 | 2088.0 | 1.0 | 0.466575 | ... | 0.492958 | 0.999508 | 0.100775 | 0.995745 | 0.520101 | 0.660270 | 0.829113 | 0.548560 | 0.286607 | 0.221379 |
3 | -22.6 | 1.572975e-07 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 428.0 | 1894.0 | 1.0 | 0.466575 | ... | 0.517329 | 0.999508 | 0.184324 | 0.997669 | 0.564668 | 0.681780 | 0.842463 | 0.572573 | 0.427302 | 0.307690 |
4 | -22.5 | 1.685873e-07 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 583.0 | 1739.0 | 1.0 | 0.466575 | ... | 0.538604 | 0.999508 | 0.251077 | 0.998288 | 0.600276 | 0.700000 | 0.853443 | 0.593324 | 0.510093 | 0.366792 |
5 rows × 25 columns
# ROC chart computed from the "cluster" label column.
linker.roc_chart_from_labels_column("cluster")
# Precision-recall chart computed from the same label column.
linker.precision_recall_chart_from_labels_column("cluster")
# Tabulate a few prediction errors relative to the "cluster" labels. Both
# false negatives and false positives are requested; the first five rows are
# shown as a pandas DataFrame.
prediction_errors = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
prediction_errors.as_pandas_dataframe(limit=5)
clerical_match_score | found_by_blocking_rules | match_weight | match_probability | unique_id_l | unique_id_r | first_name_l | first_name_r | gamma_first_name | bf_first_name | ... | tf_city_r | bf_city | bf_tf_adj_city | email_l | email_r | gamma_email | bf_email | cluster_l | cluster_r | match_key | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | False | -24.284246 | 4.894558e-08 | 417 | 418 | Florence | Brown | 0 | 0.216572 | ... | 0.00123 | 0.429162 | 1.0 | fb@reose.cem | f@b@reese.com | 0 | 0.001067 | 108 | 108 | 2 |
1 | 1.0 | False | -22.077164 | 2.260015e-07 | 796 | 797 | Taylor | NaN | -1 | 1.000000 | ... | 0.00738 | 0.429162 | 1.0 | jt40o@combs.net | jt40@cotbs.nm | 0 | 0.001067 | 201 | 201 | 2 |
2 | 1.0 | False | -19.750689 | 1.133573e-06 | 452 | 454 | NaN | Davies | -1 | 1.000000 | ... | 0.01599 | 0.429162 | 1.0 | rd@lewis.com | idlewrs.cocm | 0 | 0.001067 | 115 | 115 | 2 |
3 | 1.0 | True | -15.659150 | 1.932492e-05 | 594 | 595 | Grace | Grace | 4 | 85.509553 | ... | 0.00123 | 0.429162 | 1.0 | gk@frey-robinson.org | rgk@frey-robinon.org | 0 | 0.001067 | 146 | 146 | 0 |
4 | 1.0 | False | -14.411473 | 4.588751e-05 | 150 | 151 | Alfie | Kelly | 0 | 0.216572 | ... | 0.04920 | 0.429162 | 1.0 | alfiekelly@walters.com | NaN | -1 | 1.000000 | 40 | 40 | 2 |
5 rows × 32 columns
# Fetch the same kind of prediction-error rows (false negatives and false
# positives) as record dictionaries and draw a waterfall chart for the first
# five of them.
errors_for_waterfall = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)
records = errors_for_waterfall.as_record_dict(limit=5)
linker.waterfall_chart(records)