

Linking a dataset of real historical persons

In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from Wikidata. Duplicate records have been introduced, with a variety of errors added to them.


from splink import splink_datasets

df = splink_datasets.historical_50k
df.head()
|   | unique_id | cluster | full_name | first_and_surname | first_name | surname | dob | birth_place | postcode_fake | gender | occupation |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Q2296770-1 | Q2296770 | thomas clifford, 1st baron clifford of chudleigh | thomas chudleigh | thomas | chudleigh | 1630-08-01 | devon | tq13 8df | male | politician |
| 1 | Q2296770-2 | Q2296770 | thomas of chudleigh | thomas chudleigh | thomas | chudleigh | 1630-08-01 | devon | tq13 8df | male | politician |
| 2 | Q2296770-3 | Q2296770 | tom 1st baron clifford of chudleigh | tom chudleigh | tom | chudleigh | 1630-08-01 | devon | tq13 8df | male | politician |
| 3 | Q2296770-4 | Q2296770 | thomas 1st chudleigh | thomas chudleigh | thomas | chudleigh | 1630-08-01 | devon | tq13 8hu | None | politician |
| 4 | Q2296770-5 | Q2296770 | thomas clifford, 1st baron chudleigh | thomas chudleigh | thomas | chudleigh | 1630-08-01 | devon | tq13 8df | None | politician |
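
The cluster column is the ground-truth identifier of the underlying person, and it is used at the end of this example to measure accuracy. As a quick orientation, you can count how many duplicate records each person has (plain pandas, illustrative only):

# "cluster" identifies the true underlying person, so the number of rows per
# cluster is the number of duplicated records for that person
df["cluster"].value_counts().describe()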
from splink import DuckDBAPI
from splink.exploratory import profile_columns

db_api = DuckDBAPI()
profile_columns(df, db_api, column_expressions=["first_name", "substr(surname,1,2)"])
from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

blocking_rules = [
    block_on("substr(first_name,1,3)", "substr(surname,1,4)"),
    block_on("surname", "dob"),
    block_on("first_name", "dob"),
    block_on("postcode_fake", "first_name"),
    block_on("postcode_fake", "surname"),
    block_on("dob", "birth_place"),
    block_on("substr(postcode_fake,1,3)", "dob"),
    block_on("substr(postcode_fake,1,3)", "first_name"),
    block_on("substr(postcode_fake,1,3)", "surname"),
    block_on("substr(first_name,1,2)", "substr(surname,1,2)", "substr(dob,1,4)"),
]

db_api = DuckDBAPI()

cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=df,
    blocking_rules=blocking_rules,
    db_api=db_api,
    link_type="dedupe_only",
)
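
If you want to look at one rule in isolation rather than the cumulative chart, the blocking_analysis module also exposes count_comparisons_from_blocking_rule. A minimal sketch, reusing the df, db_api and block_on objects defined above; it returns a dictionary of comparison counts:

from splink.blocking_analysis import count_comparisons_from_blocking_rule

# Count the comparisons generated by a single blocking rule in isolation
count_comparisons_from_blocking_rule(
    table_or_tables=df,
    blocking_rule=block_on("surname", "dob"),
    link_type="dedupe_only",
    db_api=db_api,
)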
import splink.comparison_library as cl

from splink import Linker, SettingsCreator

settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=blocking_rules,
    comparisons=[
        cl.ForenameSurnameComparison(
            "first_name",
            "surname",
            forename_surname_concat_col_name="first_name_surname_concat",
        ),
        cl.DateOfBirthComparison(
            "dob", input_is_string=True
        ),
        cl.PostcodeComparison("postcode_fake"),
        cl.ExactMatch("birth_place").configure(term_frequency_adjustments=True),
        cl.ExactMatch("occupation").configure(term_frequency_adjustments=True),
    ],
    retain_intermediate_calculation_columns=True,
)
# Needed to apply term frequencies to first+surname comparison
df["first_name_surname_concat"] = df["first_name"] + " " + df["surname"]
linker = Linker(df, settings, db_api=db_api)
linker.training.estimate_probability_two_random_records_match(
    [
        "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
        "substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)",
        "l.dob = r.dob and l.postcode_fake = r.postcode_fake",
    ],
    recall=0.6,
)
Probability two random records match is estimated to be  0.000136.
This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match.  With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs
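
These figures follow directly from the size of the input data and the estimated probability; a quick sanity check in plain Python (illustrative arithmetic only, not part of the Splink API):

# The number of pairwise comparisons in a dedupe job is n * (n - 1) / 2.
# Dividing by the reported "one in 7,362.31" rate reproduces the expected
# number of matching pairs quoted above.
n = len(df)
total_comparisons = n * (n - 1) / 2
print(f"{total_comparisons:,.0f} comparisons")
print(f"{total_comparisons / 7362.31:,.0f} expected matching pairs")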
linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
----- Estimating u probabilities using random sampling -----





u probability not trained for first_name_surname - Match on reversed cols: first_name and surname (comparison vector value: 5). This usually means the comparison level was never observed in the training data.



Estimated u probabilities using random sampling



Your model is not yet fully trained. Missing estimates for:
    - first_name_surname (some u values are not trained, no m values are trained).
    - dob (no m values are trained).
    - postcode_fake (no m values are trained).
    - birth_place (no m values are trained).
    - occupation (no m values are trained).
training_blocking_rule = block_on("first_name", "surname")
training_session_names = (
    linker.training.estimate_parameters_using_expectation_maximisation(
        training_blocking_rule, estimate_without_term_frequencies=True
    )
)
----- Starting EM training session -----



Estimating the m probabilities of the model by blocking on:
(l."first_name" = r."first_name") AND (l."surname" = r."surname")

Parameter estimates will be made for the following comparison(s):
    - dob
    - postcode_fake
    - birth_place
    - occupation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name_surname





Iteration 1: Largest change in params was 0.247 in probability_two_random_records_match


Iteration 2: Largest change in params was -0.0938 in the m_probability of postcode_fake, level `Exact match on full postcode`


Iteration 3: Largest change in params was -0.0236 in the m_probability of birth_place, level `Exact match on birth_place`


Iteration 4: Largest change in params was 0.00967 in the m_probability of birth_place, level `All other comparisons`


Iteration 5: Largest change in params was -0.00467 in the m_probability of birth_place, level `Exact match on birth_place`


Iteration 6: Largest change in params was 0.00267 in the m_probability of birth_place, level `All other comparisons`


Iteration 7: Largest change in params was 0.00186 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 8: Largest change in params was 0.00127 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 9: Largest change in params was 0.000847 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 10: Largest change in params was 0.000563 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 11: Largest change in params was 0.000373 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 12: Largest change in params was 0.000247 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 13: Largest change in params was 0.000163 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 14: Largest change in params was 0.000108 in the m_probability of dob, level `Abs date difference <= 10 year`


Iteration 15: Largest change in params was 7.14e-05 in the m_probability of dob, level `Abs date difference <= 10 year`



EM converged after 15 iterations



Your model is not yet fully trained. Missing estimates for:
    - first_name_surname (some u values are not trained, no m values are trained).
training_blocking_rule = block_on("dob")
training_session_dob = (
    linker.training.estimate_parameters_using_expectation_maximisation(
        training_blocking_rule, estimate_without_term_frequencies=True
    )
)
----- Starting EM training session -----



Estimating the m probabilities of the model by blocking on:
l."dob" = r."dob"

Parameter estimates will be made for the following comparison(s):
    - first_name_surname
    - postcode_fake
    - birth_place
    - occupation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - dob





Iteration 1: Largest change in params was -0.472 in the m_probability of first_name_surname, level `Exact match on first_name_surname_concat`


Iteration 2: Largest change in params was 0.0524 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 3: Largest change in params was 0.0175 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 4: Largest change in params was 0.00537 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 5: Largest change in params was 0.00165 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 6: Largest change in params was 0.000518 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 7: Largest change in params was 0.000164 in the m_probability of first_name_surname, level `All other comparisons`


Iteration 8: Largest change in params was 5.2e-05 in the m_probability of first_name_surname, level `All other comparisons`



EM converged after 8 iterations



Your model is not yet fully trained. Missing estimates for:
    - first_name_surname (some u values are not trained).
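
Several comparisons (postcode_fake, birth_place and occupation) were estimated in both EM sessions, so it is worth checking that the two sessions broadly agree. One way to do this (a sketch, assuming the parameter_estimate_comparisons_chart method on the visualisations namespace) is:

# Compare the m probability estimates obtained across the EM training sessions
linker.visualisations.parameter_estimate_comparisons_chart()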

The final match weights can be viewed in the match weights chart:

linker.visualisations.match_weights_chart()
linker.evaluation.unlinkables_chart()
df_predict = linker.inference.predict()
df_e = df_predict.as_pandas_dataframe(limit=5)
df_e
Blocking time: 0.65 seconds


Predict time: 1.71 seconds



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    u values not fully trained
|   | match_weight | match_probability | unique_id_l | unique_id_r | first_name_l | first_name_r | surname_l | surname_r | first_name_surname_concat_l | first_name_surname_concat_r | ... | bf_birth_place | bf_tf_adj_birth_place | occupation_l | occupation_r | gamma_occupation | tf_occupation_l | tf_occupation_r | bf_occupation | bf_tf_adj_occupation | match_key |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.903133 | 0.983565 | Q6105786-11 | Q6105786-6 | joan | j. | garson | garson | joan garson | j. garson | ... | 0.164159 | 1.000000 | anthropologist | anatomist | 0 | 0.002056 | 0.000593 | 0.107248 | 1.0 | 4 |
| 1 | 2.354819 | 0.836476 | Q6105786-11 | Q6105786-8 | joan | j. | garson | garson | joan garson | j. garson | ... | 0.164159 | 1.000000 | anthropologist | anatomist | 0 | 0.002056 | 0.000593 | 0.107248 | 1.0 | 4 |
| 2 | 2.354819 | 0.836476 | Q6105786-11 | Q6105786-9 | joan | ian | garson | garson | joan garson | ian garson | ... | 0.164159 | 1.000000 | anthropologist | anatomist | 0 | 0.002056 | 0.000593 | 0.107248 | 1.0 | 4 |
| 3 | 3.319202 | 0.908935 | Q6105786-11 | Q6105786-13 | joan | j. | garson | garson | joan garson | j. garson | ... | 0.164159 | 1.000000 | anthropologist | None | -1 | 0.002056 | NaN | 1.000000 | 1.0 | 4 |
| 4 | 16.881661 | 0.999992 | Q6241382-1 | Q6241382-11 | john | joan | jackson | jackson | john jackson | joan jackson | ... | 147.489511 | 17.689372 | author | None | -1 | 0.003401 | NaN | 1.000000 | 1.0 | 4 |

5 rows × 42 columns
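
The dataframe above shows only the first five scored pairs. To pull out just the high-confidence pairs from the full prediction table, you can convert it to pandas and filter on match_probability (a sketch using the column names visible in the output above):

# Keep only pairwise predictions scored above 0.99 match probability
df_all_predictions = df_predict.as_pandas_dataframe()
high_confidence = df_all_predictions[df_all_predictions["match_probability"] > 0.99]
high_confidence[["unique_id_l", "unique_id_r", "match_probability"]].head()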

You can also view rows in this dataset as a waterfall chart as follows:

records_to_plot = df_e.to_dict(orient="records")
linker.visualisations.waterfall_chart(records_to_plot, filter_nulls=False)
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predict, threshold_match_probability=0.95
)
Completed iteration 1, root rows count 858


Completed iteration 2, root rows count 202


Completed iteration 3, root rows count 68


Completed iteration 4, root rows count 9


Completed iteration 5, root rows count 1


Completed iteration 6, root rows count 0
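
Before opening the dashboard, it can be useful to check the distribution of cluster sizes, for example to spot implausibly large clusters. A sketch, assuming the clustered output contains a cluster_id column as in recent Splink versions:

# Count how many records ended up in each cluster and inspect the largest ones
clusters_df = clusters.as_pandas_dataframe()
cluster_sizes = clusters_df["cluster_id"].value_counts()
print(cluster_sizes.describe())
print(cluster_sizes.head(10))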
from IPython.display import IFrame

linker.visualisations.cluster_studio_dashboard(
    df_predict,
    clusters,
    "dashboards/50k_cluster.html",
    sampling_method="by_cluster_size",
    overwrite=True,
)


IFrame(src="./dashboards/50k_cluster.html", width="100%", height=1200)

linker.evaluation.accuracy_analysis_from_labels_column(
    "cluster", output_type="accuracy", match_weight_round_to_nearest=0.02
)
Blocking time: 1.10 seconds


Predict time: 1.54 seconds



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    u values not fully trained
records = linker.evaluation.prediction_errors_from_labels_column(
    "cluster",
    threshold_match_probability=0.999,
    include_false_negatives=False,
    include_false_positives=True,
).as_record_dict()
linker.visualisations.waterfall_chart(records)
Blocking time: 0.86 seconds


Predict time: 0.30 seconds



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    u values not fully trained
# Some of the false negatives will be because they weren't detected by the blocking rules
records = linker.evaluation.prediction_errors_from_labels_column(
    "cluster",
    threshold_match_probability=0.5,
    include_false_negatives=True,
    include_false_positives=False,
).as_record_dict(limit=50)

linker.visualisations.waterfall_chart(records)
Blocking time: 0.92 seconds


Predict time: 0.30 seconds



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    u values not fully trained