Skip to content

Documentation for SplinkDataFrame object

Abstraction over a dataframe that handles basic operations such as retrieving data and retrieving column names, which need different implementations depending on whether the underlying table is a Spark dataframe, a SQLite table, etc.

Uses methods like as_pandas_dataframe() and as_record_dict() to retrieve data

Source code in splink/splink_dataframe.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class SplinkDataFrame:
    """Abstraction over dataframe to handle basic operations like retrieving data and
    retrieving column names, which need different implementations depending on whether
    it's a spark dataframe, sqlite table etc.

    Uses methods like `as_pandas_dataframe()` and `as_record_dict()` to retrieve data
    """

    def __init__(self, templated_name, physical_name):
        """Record the two names under which this table is known.

        Args:
            templated_name: Name of the splink entity this table represents
            physical_name (str): Name of the table in the database
        """
        self.templated_name = templated_name
        self.physical_name = physical_name

    @property
    def columns(self):
        """Columns of the underlying table.

        The base class has no implementation and returns None; backend-specific
        subclasses are expected to provide one.
        """

    @property
    def columns_escaped(self):
        """Return the result of calling `.name()` on every entry of `self.columns`.

        Requires `self.columns` to be iterable, so this only works once a subclass
        has implemented `columns`.
        """
        # NOTE(review): presumably `.name()` yields the SQL-escaped column name -
        # confirm against the column type returned by the subclass.
        cols = self.columns
        return [c.name() for c in cols]

    def validate(self):
        """Validation hook; a no-op in the base class."""

    def _random_sample_sql(self, percent):
        """Placeholder for the backend-specific random-sampling SQL.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError("Random sample sql not implemented for this linker")

    @property
    def physical_and_template_names_equal(self):
        """True when the templated name and the physical name are the same."""
        return self.templated_name == self.physical_name

    def _check_drop_table_created_by_splink(self, force_non_splink_table=False):
        """Guard to run before dropping the table.

        Args:
            force_non_splink_table (bool, optional): If True, skip the check that
                the physical name starts with `__splink__`. Defaults to False.

        Raises:
            ValueError: If the physical name does not start with `__splink__` and
                `force_non_splink_table` is falsy.
        """
        created_by_splink = self.physical_name.startswith("__splink__")
        if not created_by_splink and not force_non_splink_table:
            raise ValueError(
                f"You've asked to drop table {self.physical_name} from your "
                "database which is not a table created by Splink.  If you really "
                "want to drop this table, you can do so by setting "
                "force_non_splink_table=True"
            )
        logger.debug(
            f"Dropping table with templated name {self.templated_name} and "
            f"physical name {self.physical_name}"
        )

    def drop_table_from_database(self, force_non_splink_table=False):
        """Drop the table from the database.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError(
            "Drop table from database not implemented for this linker"
        )

    def as_record_dict(self, limit=None):
        """Retrieve the table's records.

        The base class has no implementation and returns None.

        Args:
            limit (int, optional): Accepted for interface compatibility; unused in
                the base class. Defaults to None.
        """

    def as_pandas_dataframe(self, limit=None):
        """Return the dataframe as a pandas dataframe.

        This can be computationally expensive if the dataframe is large.

        Args:
            limit (int, optional): If provided, return this number of rows (equivalent
                to a limit statement in SQL). Defaults to None, meaning return all
                rows

        Returns:
            pandas.DataFrame: pandas Dataframe
        """
        # Imported lazily so pandas is only needed when this method is called.
        import pandas as pd

        return pd.DataFrame(self.as_record_dict(limit=limit))

    def __repr__(self):
        """Describe the table and how to retrieve its records."""
        return (
            f"Table name in database: `{self.physical_name}`\n"
            "\nTo retrieve records, you can call the following methods on this object:"
            "\n`.as_record_dict(limit=5)` or "
            "`.as_pandas_dataframe(limit=5)`.\n"
            "\nYou may omit the `limit` argument to return all records."
            "\n\nThis table represents the following splink entity: "
            f"{self.templated_name}"
        )
Source code in splink/splink_dataframe.py
53
54
55
56
def drop_table_from_database(self, force_non_splink_table=False):
    """Drop the table from the database (not available in this implementation).

    Neither argument is inspected; the call fails unconditionally.

    Raises:
        NotImplementedError: Always.
    """
    message = "Drop table from database not implemented for this linker"
    raise NotImplementedError(message)

Return the dataframe as a pandas dataframe.

This can be computationally expensive if the dataframe is large.

Parameters:

Name Type Description Default
limit int

If provided, return this number of rows (equivalent to a limit statement in SQL). Defaults to None, meaning return all rows.

None

Returns:

Type Description

pandas.DataFrame — the table's records as a pandas DataFrame

Source code in splink/splink_dataframe.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def as_pandas_dataframe(self, limit=None):
    """Materialise the table as a pandas DataFrame.

    Builds the frame from whatever `self.as_record_dict(limit=limit)` returns, so
    this may be costly for a large table.

    Args:
        limit (int, optional): Maximum number of rows to return, like a SQL
            `LIMIT`. Defaults to None, meaning all rows are returned.

    Returns:
        pandas.DataFrame: The retrieved records as a DataFrame.
    """
    # pandas is imported before the records are fetched, as in the original.
    import pandas as pd

    records = self.as_record_dict(limit=limit)
    return pd.DataFrame(records)