Skip to content

Quick and dirty persons model

Historical people: Quick and dirty

This example shows how to get some initial record linkage results as quickly as possible.

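There are many ways to improve the accuracy of this model, but it is a good place to start if you just want to give Splink a try and see what it's capable of. In particular, this example does not run a training step for the m values, so `predict()` falls back to default values for them and prints a warning (shown in the output below).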

Open In Colab

from splink.datasets import splink_datasets

# Load the `historical_50k` example dataset exposed by `splink_datasets`.
df = splink_datasets.historical_50k
# Preview the first five rows (rendered as the table that follows).
df.head(5)
unique_id cluster full_name first_and_surname first_name surname dob birth_place postcode_fake gender occupation
0 Q2296770-1 Q2296770 thomas clifford, 1st baron clifford of chudleigh thomas chudleigh thomas chudleigh 1630-08-01 devon tq13 8df male politician
1 Q2296770-2 Q2296770 thomas of chudleigh thomas chudleigh thomas chudleigh 1630-08-01 devon tq13 8df male politician
2 Q2296770-3 Q2296770 tom 1st baron clifford of chudleigh tom chudleigh tom chudleigh 1630-08-01 devon tq13 8df male politician
3 Q2296770-4 Q2296770 thomas 1st chudleigh thomas chudleigh thomas chudleigh 1630-08-01 devon tq13 8hu None politician
4 Q2296770-5 Q2296770 thomas clifford, 1st baron chudleigh thomas chudleigh thomas chudleigh 1630-08-01 devon tq13 8df None politician
from splink import block_on, SettingsCreator
import splink.comparison_library as cl


# Blocking rules passed as `blocking_rules_to_generate_predictions`, i.e. the
# rules used to generate the record pairs that get scored at prediction time.
prediction_blocking_rules = [
    block_on("full_name"),
    block_on("substr(full_name,1,6)", "dob", "birth_place"),
    block_on("dob", "birth_place"),
    block_on("postcode_fake"),
]

# One comparison per piece of evidence; each is built separately so the
# settings object below reads as a plain list of names.
name_comparison = cl.ForenameSurnameComparison(
    "first_name",
    "surname",
    forename_surname_concat_col_name="first_and_surname",
)
# `dob` is held as a string column in this dataset (e.g. "1630-08-01").
dob_comparison = cl.DateOfBirthComparison("dob", input_is_string=True)
postcode_comparison = cl.LevenshteinAtThresholds("postcode_fake", 2)
# Term-frequency adjustments are switched on for the two comparisons below.
birth_place_comparison = cl.JaroWinklerAtThresholds("birth_place", 0.9).configure(
    term_frequency_adjustments=True
)
occupation_comparison = cl.ExactMatch("occupation").configure(
    term_frequency_adjustments=True
)

# Deduplication of a single input table.
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=[
        name_comparison,
        dob_comparison,
        postcode_comparison,
        birth_place_comparison,
        occupation_comparison,
    ],
)
from splink import Linker, DuckDBAPI


# Build the linker over `df` with the settings above, using the DuckDB backend.
# `set_up_basic_logging=False` opts out of the linker's basic logging setup.
linker = Linker(df, settings, db_api=DuckDBAPI(), set_up_basic_logging=False)
# SQL conditions over the left (`l`) and right (`r`) record of a pair, used
# only for the estimate of the probability that two random records match.
deterministic_rules = [
    "l.full_name = r.full_name",
    "l.postcode_fake = r.postcode_fake and l.dob = r.dob",
]

# `recall=0.6` is a guess supplied by the author, not a measured value.
# NOTE(review): presumably the share of true matches these rules are expected
# to find -- confirm against the Splink documentation.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules, recall=0.6
)
# Estimate the u values from randomly sampled record pairs (at most 2e6).
# No m-value training is run in this example; see the warning printed by
# predict() further down.
linker.training.estimate_u_using_random_sampling(max_pairs=2e6)
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
# Score the record pairs, applying a match-probability threshold of 0.9.
results = linker.inference.predict(threshold_match_probability=0.9)
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    m values not fully trained
Comparison: 'first_name_surname':
    u values not fully trained
Comparison: 'dob':
    m values not fully trained
Comparison: 'postcode_fake':
    m values not fully trained
Comparison: 'birth_place':
    m values not fully trained
Comparison: 'occupation':
    m values not fully trained
# Pull the first five predictions into a pandas DataFrame for inspection.
results.as_pandas_dataframe(limit=5)
match_weight match_probability unique_id_l unique_id_r first_name_l first_name_r surname_l surname_r first_and_surname_l first_and_surname_r ... gamma_postcode_fake birth_place_l birth_place_r gamma_birth_place occupation_l occupation_r gamma_occupation full_name_l full_name_r match_key
0 3.170005 0.900005 Q7412607-1 Q7412607-3 samuel samuel shelley shelley samuel shelley samuel shelley ... 0 whitechapel city of london 0 illuminator illuminator 1 samuel shelley samuel shelley 0
1 3.170695 0.900048 Q15997578-4 Q15997578-7 job wilding wilding None job wilding wilding ... -1 wrexham wrexham 2 association football player association football player 1 job wilding wilding 2
2 3.170695 0.900048 Q15997578-2 Q15997578-7 job wilding wilding None job wilding wilding ... -1 wrexham wrexham 2 association football player association football player 1 job wilding wilding 2
3 3.170695 0.900048 Q15997578-1 Q15997578-7 job wilding wilding None job wilding wilding ... -1 wrexham wrexham 2 association football player association football player 1 job wilding wilding 2
4 3.172553 0.900164 Q5726641-11 Q5726641-8 henry harry page paige henry page harry paige ... 2 staffordshire moorlands staffordshire moorlands 2 cricketer cricketer 1 henry page harry paige 3

5 rows × 26 columns