Skip to content

Quick and dirty persons model

Historical people: Quick and dirty

This example shows how to get some initial record linkage results as quickly as possible.

There are many ways to improve the accuracy of this model, but it may be a good place to start if you just want to give Splink a try and see what it's capable of.

import pandas as pd 
# Load the sample data set of historical figures (the file name suggests
# roughly 50k records with deliberately introduced errors).
df = pd.read_parquet("./data/historical_figures_with_errors_50k.parquet")
# Preview the first five rows.
df.head(5)
uncorrupted_record cluster full_name dob birth_place postcode_fake lat lng gender occupation unique_id
0 True Q2296770 thomas clifford, 1st baron clifford of chudleigh 1630-08-01 Devon TQ13 8DF 50.692449 -3.813964 male politician Q2296770-1
1 False Q2296770 thomas of chudleigh 1630-08-01 Devon TQ13 8DF 50.692449 -3.813964 male politician Q2296770-2
2 False Q2296770 tom 1st baron clifford of chudleigh 1630-08-01 Devon TQ13 8DF 50.692449 -3.813964 male politician Q2296770-3
3 False Q2296770 thomas 1st chudleigh 1630-08-01 Devon TQ13 8HU 50.687638 -3.895877 None politician Q2296770-4
4 False Q2296770 thomas clifford, 1st baron chudleigh 1630-08-01 Devon TQ13 8DF 50.692449 -3.813964 None politician Q2296770-5
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
# Splink settings: deduplicate a single table of person records.
settings = {
    "link_type": "dedupe_only",
    # SQL conditions on the left (l) and right (r) record of a candidate pair,
    # used when generating predictions.
    "blocking_rules_to_generate_predictions": [
        "l.full_name = r.full_name",
        "substr(l.full_name,1,6) = substr(r.full_name,1,6) and l.dob = r.dob and l.birth_place = r.birth_place",
        "l.dob = r.dob and l.birth_place = r.birth_place",
        "l.postcode_fake = r.postcode_fake",
    ],
    # One comparison per column: Levenshtein distance thresholds for the
    # free-text-like columns, exact match for the categorical ones.
    "comparisons": [
        cl.levenshtein_at_thresholds(
            "full_name", [1, 3, 5], term_frequency_adjustments=True
        ),
        cl.levenshtein_at_thresholds(
            "dob", [1, 2], term_frequency_adjustments=True
        ),
        cl.levenshtein_at_thresholds("postcode_fake", 2),
        cl.exact_match("birth_place", term_frequency_adjustments=True),
        cl.exact_match("occupation", term_frequency_adjustments=True),
    ],
}

# Build the DuckDB-backed linker over the input frame; basic logging set-up
# is explicitly switched off.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# Rules passed to the prior-probability estimate below; presumably chosen
# because pairs satisfying them are very likely to be true matches.
deterministic_rules = [
    "l.full_name = r.full_name",
    "l.postcode_fake = r.postcode_fake and l.dob = r.dob",
]

# recall=0.6 is supplied by hand.
# NOTE(review): presumably the assumed share of true matches that the
# deterministic rules capture -- confirm against the Splink documentation.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)
# Estimate the u values by random sampling, with target_rows set to 2e6.
linker.estimate_u_using_random_sampling(target_rows=2e6)
# No m-value estimation step is run in this script, so predict() warns that
# the m values are not fully trained and falls back to default values.
# The result is filtered using a match-probability threshold of 0.9.
results = linker.predict(threshold_match_probability=0.9)

 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'full_name':
    m values not fully trained
Comparison: 'dob':
    m values not fully trained
Comparison: 'postcode_fake':
    m values not fully trained
Comparison: 'birth_place':
    m values not fully trained
Comparison: 'occupation':
    m values not fully trained

# Show the first five predicted pairs as a pandas DataFrame.
results.as_pandas_dataframe(limit=5)
match_weight match_probability unique_id_l unique_id_r full_name_l full_name_r gamma_full_name dob_l dob_r gamma_dob postcode_fake_l postcode_fake_r gamma_postcode_fake birth_place_l birth_place_r gamma_birth_place occupation_l occupation_r gamma_occupation match_key
0 31.481528 1.000000 Q90404618-1 Q90404618-3 emlie clifford emlie clifford 4 1861-01-01 1861-01-01 3 WR11 7QP WR11 7QW 1 Wychavon Wychavon 1 playwright playwright 1 0
1 31.481528 1.000000 Q90404618-2 Q90404618-3 emlie clifford emlie clifford 4 1861-01-01 1861-01-01 3 WR11 7QP WR11 7QW 1 Wychavon Wychavon 1 playwright playwright 1 0
2 14.090741 0.999943 Q2516590-3 Q2516590-9 william watts william watts 4 1860-06-07 NaN -1 SY5 7NT SY5 7NT 2 Shropshire NaN -1 geologist NaN -1 0
3 54.751297 1.000000 Q631006-1 Q631006-2 moses gaster moses gaster 4 1856-09-17 1856-09-17 3 EX20 3PZ EX20 3PZ 2 Bucharest Bucharest 1 rabbi rabbi 1 0
4 21.428205 1.000000 Q7795446-2 Q7795446-3 thomas barry thomas barry 4 1560-01-01 1560-01-01 3 CF14 5GH CF14 6TQ 0 Cardiff Cardiff 1 judge judge 1 0