Automated Data Testing¶
In addition to using Datatest directly in your own projects, you can also use it with a testing framework like pytest or unittest. Automated testing of data is a good solution when you need to validate and manage:
batch data before loading
datasets for an important project
datasets intended for publication
status of a long-lived, critical data system
comparisons between your data and some reference data
data migration projects
complex data-wrangling processes
Data testing is a form of acceptance testing—akin to operational acceptance testing. Using an incremental approach, we check that data properties satisfy certain requirements. A test suite should include as many tests as necessary to determine if a dataset is fit for purpose.
Pytest¶
With `pytest`, you can use datatest functions and classes just as you would in any other context. And you can run `pytest` using its normal console interface (see Usage and Invocations).
To facilitate incremental testing, datatest implements a “mandatory” marker to stop the session early when a mandatory test fails:
@pytest.mark.mandatory
def test_columns():
...
You can also use the `-x` option to stop testing after the first failure of any test:
pytest -x
If needed, you can use `--ignore-mandatory` to ignore “mandatory” markers and continue testing even when a mandatory test fails:
pytest --ignore-mandatory
Pytest Samples¶
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | #!/usr/bin/env python3
import pytest
import pandas as pd
import datatest as dt
from datatest import (
Missing,
Extra,
Invalid,
Deviation,
)
@pytest.fixture(scope='session')
@dt.working_directory(__file__)
def df():
return pd.read_csv('example.csv') # Returns DataFrame.
@pytest.mark.mandatory
def test_column_names(df):
required_names = {'A', 'B', 'C'}
dt.validate(df.columns, required_names)
def test_a(df):
requirement = {'x', 'y', 'z'}
dt.validate(df['A'], requirement)
# ...add more tests here...
if __name__ == '__main__':
import sys
sys.exit(pytest.main(sys.argv))
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | #!/usr/bin/env python3
import pytest
import pandas as pd
import datatest as dt
from datatest import (
Missing,
Extra,
Invalid,
Deviation,
)
@pytest.fixture(scope='session')
@dt.working_directory(__file__)
def df():
return pd.read_csv('example.csv') # Returns DataFrame.
@pytest.fixture(scope='session', autouse=True)
def pandas_integration():
dt.register_accessors()
@pytest.mark.mandatory
def test_column_names(df):
required_names = {'A', 'B', 'C'}
df.columns.validate(required_names)
def test_a(df):
requirement = {'x', 'y', 'z'}
df['A'].validate(requirement)
# ...add more tests here...
if __name__ == '__main__':
import sys
sys.exit(pytest.main(sys.argv))
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | #!/usr/bin/env python3
import pytest
import squint
from datatest import (
validate,
accepted,
working_directory,
Missing,
Extra,
Invalid,
Deviation,
)
@pytest.fixture(scope='session')
@working_directory(__file__)
def select():
return squint.Select('example.csv')
@pytest.mark.mandatory
def test_column_names(select):
required_names = {'A', 'B', 'C'}
validate(select.fieldnames, required_names)
def test_a(select):
requirement = {'x', 'y', 'z'}
validate(select('A'), requirement)
# ...add more tests here...
if __name__ == '__main__':
import sys
sys.exit(pytest.main(sys.argv))
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | #!/usr/bin/env python3
import pytest
import sqlite3
from datatest import (
validate,
accepted,
working_directory,
Missing,
Extra,
Invalid,
Deviation,
)
@pytest.fixture(scope='session')
def connection():
with working_directory(__file__):
conn = sqlite3.connect('example.sqlite3')
yield conn
conn.close()
@pytest.fixture(scope='function')
def cursor(connection):
cur = connection.cursor()
yield cur
cur.close()
@pytest.mark.mandatory
def test_column_names(cursor):
cursor.execute('SELECT * FROM mytable LIMIT 0;')
column_names = [item[0] for item in cursor.description]
required_names = {'A', 'B', 'C'}
validate(column_names, required_names)
def test_a(cursor):
cursor.execute('SELECT A FROM mytable;')
requirement = {'x', 'y', 'z'}
validate(cursor, requirement)
# ...add more tests here...
if __name__ == '__main__':
import sys
sys.exit(pytest.main(sys.argv))
|
Unittest¶
Datatest provides a handful of tools for integrating data validation with a `unittest` test suite. While normal datatest functions work fine, this integration provides an interface that is more consistent with established unittest conventions (e.g., “mixedCase” methods, decorators, and helper classes).
Datatest’s `DataTestCase` extends `unittest.TestCase` to provide unittest-style wrappers for validation and acceptance (see reference docs for full details):
from datatest import DataTestCase, Extra
class TestMyData(DataTestCase):
def test_one(self):
data = ['A', 'B', 'C', 'D']
requirement = {'A', 'B'}
with self.accepted(Extra):
self.assertValid(data, requirement)
Datatest includes a `@mandatory` decorator to help with incremental testing:
from datatest import DataTestCase, mandatory
class TestMyData(DataTestCase):
@mandatory
def test_one(self):
data = ['A', 'A', 'B', 'B']
requirement = {'A', 'B'}
self.assertValid(data, requirement)
Datatest also provides a `main()` function and test runner that runs tests in declaration order (by the line number on which each test is defined). You can invoke datatest’s runner using:
python -m datatest
In addition to using the `@mandatory` decorator, you can use the `-f` option to stop after any failing test:
python -m datatest -f
You can also use features directly from `unittest`. This includes decorators like `@skip()` and `@skipIf()`, functions like `addModuleCleanup()`, and features like Class and Module Fixtures:
import unittest
from datatest import DataTestCase
class TestMyData(DataTestCase):
@unittest.skip('Data not yet collected.')
def test_one(self):
data = ...
requirement = ...
self.assertValid(data, requirement)
Unittest Samples¶
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | #!/usr/bin/env python3
import pandas as pd
import datatest as dt
from datatest import (
Missing,
Extra,
Invalid,
Deviation,
)
@dt.working_directory(__file__)
def setUpModule():
global df
df = pd.read_csv('example.csv')
class TestMyData(dt.DataTestCase):
@dt.mandatory
def test_column_names(self):
required_names = {'A', 'B', 'C'}
self.assertValid(df.columns, required_names)
def test_a(self):
requirement = {'x', 'y', 'z'}
self.assertValid(df['A'], requirement)
# ...add more tests here...
if __name__ == '__main__':
from datatest import main
main()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | #!/usr/bin/env python3
import pandas as pd
import datatest as dt
from datatest import (
Missing,
Extra,
Invalid,
Deviation,
)
@dt.working_directory(__file__)
def setUpModule():
global df
df = pd.read_csv('example.csv')
dt.register_accessors() # Register pandas accessors.
class TestMyData(dt.DataTestCase):
@dt.mandatory
def test_column_names(self):
required_names = {'A', 'B', 'C'}
df.columns.validate(required_names)
def test_a(self):
requirement = {'x', 'y', 'z'}
df['A'].validate(requirement)
# ...add more tests here...
if __name__ == '__main__':
from datatest import main
main()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | #!/usr/bin/env python3
import squint
from datatest import (
DataTestCase,
mandatory,
working_directory,
Missing,
Extra,
Invalid,
Deviation,
)
@working_directory(__file__)
def setUpModule():
global select
select = squint.Select('example.csv')
class TestMyData(DataTestCase):
@mandatory
def test_column_names(self):
required_names = {'A', 'B', 'C'}
self.assertValid(select.fieldnames, required_names)
def test_a(self):
requirement = {'x', 'y', 'z'}
self.assertValid(select('A'), requirement)
# ...add more tests here...
if __name__ == '__main__':
from datatest import main
main()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | #!/usr/bin/env python3
import sqlite3
from datatest import (
DataTestCase,
mandatory,
working_directory,
Missing,
Extra,
Invalid,
Deviation,
)
@working_directory(__file__)
def setUpModule():
global connection
connection = sqlite3.connect('example.sqlite3')
def tearDownModule():
connection.close()
class MyTest(DataTestCase):
def setUp(self):
cursor = connection.cursor()
self.addCleanup(cursor.close)
self.cursor = cursor
@mandatory
def test_column_names(self):
self.cursor.execute('SELECT * FROM mytable LIMIT 0;')
column_names = [item[0] for item in self.cursor.description]
required_names = {'A', 'B', 'C'}
self.assertValid(column_names, required_names)
def test_a(self):
self.cursor.execute('SELECT A FROM mytable;')
requirement = {'x', 'y', 'z'}
self.assertValid(self.cursor, requirement)
if __name__ == '__main__':
from datatest import main
main()
|
Data for Script Samples¶
The test samples given on this page were written to check the following dataset:
| A | B   | C  |
|---|-----|----|
| x | foo | 20 |
| x | foo | 30 |
| y | foo | 10 |
| y | bar | 20 |
| z | bar | 10 |
| z | bar | 10 |