Commit 45f4c2ab authored by damien clochard's avatar damien clochard

Merge branch 'master' of gitlab.com:dalibo/postgresql_anonymizer

parents ec7d13e6 1a9cd22a
Pipeline #237301038 passed with stages
in 4 minutes and 53 seconds
......@@ -41,7 +41,8 @@ lint-bash:
before_script:
- echo "disable the before_script"
script:
- apt-get update && apt-get install -y shellcheck
- apt-get update
- apt-get install -y --no-install-recommends shellcheck
- shellcheck bin/standalone.sh
- shellcheck bin/pg_dump_anon.sh
- shellcheck docker/anon.sh
......@@ -64,7 +65,8 @@ make9.5:
stage: build
image: postgres:9.5
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-9.5 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-server-dev-9.5 pgxnclient
- pgxn install ddlx
- make extension
- make install
......@@ -84,7 +86,8 @@ make9.6:
stage: build
image: postgres:9.6
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-9.6 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc gcc-6-multilib postgresql-server-dev-9.6 pgxnclient
- pgxn install ddlx
- make extension
- make install
......@@ -102,7 +105,8 @@ make10:
stage: build
image: postgres:10
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-10 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc gcc-6-multilib postgresql-server-dev-10 pgxnclient
- pgxn install ddlx
- make extension
- make install
......@@ -121,7 +125,8 @@ make11:
stage: build
image: postgres:11
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-11 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-server-dev-11 pgxnclient
- pgxn install ddlx
- make extension
- make install
......@@ -139,7 +144,8 @@ make12:
stage: build
image: postgres:12
script:
- apt-get update && apt-get install -y make gcc git postgresql-server-dev-12 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc git postgresql-server-dev-12 pgxnclient
- pgxn install ddlx
# - git clone https://github.com/lacanoid/pgddl.git
# - make -C pgddl && make -C pgddl install
......@@ -159,7 +165,8 @@ make13:
stage: build
image: postgres:13
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-13
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-server-dev-13
- make extension
- make install
- psql -c "ALTER SYSTEM SET session_preload_libraries = 'anon'"
......@@ -197,7 +204,8 @@ standalone12:
before_script:
- echo "disable the before_script"
script:
- apt-get update && apt-get install -y make git postgresql-server-dev-all
- apt-get update
- apt-get install -y --no-install-recommends make git postgresql-server-dev-all
- make anon_standalone.sql
- export PGPASSWORD=$POSTGRES_PASSWORD
- $PSQL -h "postgres" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -f anon_standalone.sql
......@@ -210,7 +218,8 @@ standalone11:
before_script:
- echo "disable the before_script"
script:
- apt-get update && apt-get install -y make git postgresql-server-dev-all
- apt-get update
- apt-get install -y --no-install-recommends make git postgresql-server-dev-all
- make anon_standalone.sql
- export PGPASSWORD=$POSTGRES_PASSWORD
- $PSQL -h "postgres" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -f anon_standalone.sql
......@@ -239,7 +248,8 @@ blackbox:
demo:
stage: test
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-11 postgresql-contrib-11 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-server-dev-11 postgresql-contrib-11 pgxnclient
- pgxn install ddlx
- make extension
- make install
......@@ -292,7 +302,7 @@ pgxn:
before_script:
- echo 'Disable before_script.'
script:
- apk update && apk add make git zip
- apk add -U make git zip
- make pgxn
artifacts:
paths:
......@@ -307,7 +317,8 @@ test_pgxn:
before_script:
- echo "disable the before_script"
script:
- apt-get update && apt-get install -y make gcc postgresql-server-dev-11 pgxnclient
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-server-dev-11 pgxnclient
- pgxn install postgresql_anonymizer
when: manual
......@@ -318,9 +329,10 @@ test_pgxn_ubuntu_pg95:
before_script:
- echo "disable the before_script"
script:
- apt-get update && apt-get install -y make gcc postgresql-common pgxnclient gnupg ca-cacert
- apt-get update
- apt-get install -y --no-install-recommends make gcc postgresql-common pgxnclient gnupg ca-cacert
- yes '' | sh /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh bionic
- apt-get install -y postgres-9.5 postgres-9.5-contrib postgresql-server-dev-9.5
- apt-get install -y --no-install-recommends postgres-9.5 postgres-9.5-contrib postgresql-server-dev-9.5
- export PGDATA=/var/lib/postgresql/data
- export PGUSER=postgres
- su postgres -c /usr/lib/postgresql/9.5/bin/initdb
......
......@@ -6,10 +6,11 @@ FROM postgres:$PG_MAJOR_VERSION
# used in any instruction after a FROM. We need to declare it again.
ARG PG_MAJOR_VERSION
RUN apt-get update && apt-get install -y \
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
make \
postgresql-server-dev-$PG_MAJOR_VERSION\
postgresql-server-dev-$PG_MAJOR_VERSION \
libc6-dev \
wget \
&& rm -rf /var/lib/apt/lists/*
......
FROM postgres:9.6
RUN apt-get update && apt-get install -y \
make \
postgresql-server-dev-9.6 \
&& rm -rf /var/lib/apt/lists/*
COPY anon* /usr/share/postgresql/9.6/extension/
......@@ -187,16 +187,17 @@ All new connections to the database can now use the extension.
Install in the cloud
------------------------------------------------------------------------------
> **DISCLAIMER** if privacy and anonymity are a concern to you, hosting your
> data on someone else's computer is probably not a clever idea....
Generally Database As A Service operators ( such as Amazon RDS ) do not allow
their clients to load any extension. Instead they support only a limited subset
of extensions, such as PostGIS or pgcrypto. You can ask them if they plan to
support this one in the near future, but you shouldn't bet your life on it 😃
However this tool is set of `plpgsql` functions, which means should you be able
to install it directly without declaring an extension.
> **WARNING** This extension was never really intended to work on Database As A
> Service platforms ( such as Amazon RDS ). It just happens to work currently
> using the `standalone` method described below. In future versions, we may
> introduce features that will force us to drop the support for this. If
> privacy and anonymity are a concern to you, we encourage you to contact the
> customer services of these platforms and ask them if they plan to support
> this extension.
That being said, currently `PostgreSQL Anonymizer` is a set of `plpgsql` functions,
which means you should be able to install it directly without declaring
an extension.
Here's a few steps to try it out:
......
Security
===============================================================================
Permissions
------------------------------------------------------------------------------
Here's an overview of what users can do depending on the role they have:
| Action | Superuser | Owner | Masked Role |
| :--------------------------------------- | :-------: | :---: | :---------: |
| Create the extension | Yes | | |
| Drop the extension | Yes | | |
| Init the extension | Yes | | |
| Reset the extension | Yes | | |
| Configure the extension | Yes | Yes | |
| Start dynamic masking | Yes | Yes | |
| Stop dynamic masking | Yes | Yes | |
| Create a table | Yes | Yes | |
| Declare a masking rule | Yes | Yes | |
| Insert, delete, update a row | Yes | Yes | |
| Static Masking | Yes | Yes | |
| Select the real data | Yes | Yes | |
| Regular Dump | Yes | Yes | |
| Select the masked data | Yes | Yes | Yes |
| Anonymous Dump | Yes | Yes | Yes |
Functions security context
Security context of the functions
------------------------------------------------------------------------------
Most functions of this extension are declared with the `SECURITY INVOKER` tag.
This means that theses functions are executed with the privileges of the user
that calls it. This is an important restriction.
Most of the functions of this extension are declared with the `SECURITY INVOKER`
tag.
This means that these functions are executed with the privileges of the user
that calls them. This is an important restriction.
This extension contains a few functions declared with the tag `SECURITY DEFINER`.
This extension contains another few functions declared with the tag
`SECURITY DEFINER`.
......@@ -64,7 +64,7 @@ If you don't provide the connection password to `pg_dump_anon` using the
`--password` option, you may have to type the password multiple times. To
avoid this, you can either [define the $PGPASS variable] or place your
password in a [.pgpass] file.
*
[define the $PGPASS variable]: https://www.postgresql.org/docs/current/libpq-envars.html
......@@ -75,8 +75,8 @@ The version 0.3 of PostgreSQL Anonymizer introduced a function called
`anon.dump()`. This function is extremely slow. Since version 0.6, it has
been deprecated and it is not supported anymore.
The function is kept as is for backward compatibility. It will be probably be
remove from one ogf the forthcoming versions.
The function is kept as is for backward compatibility. It will probably be
removed from one of the forthcoming versions.
Again: do not use this function ! To dump the masked data, use the
`pg_dump_anon` command line tool as described above.
......@@ -17,7 +17,7 @@ The data can be altered with several techniques:
* **Deletion** or **Nullification** simply removes data.
* **Static Substitution** consistently replaces the data with a generic
values. For instance: replacing all values of TEXT column with the value
value. For instance: replacing all values of a TEXT column with the value
"CONFIDENTIAL".
* **Variance** is the action of "shifting" dates and numeric values. For
......@@ -45,7 +45,7 @@ The data can be altered with several techniques:
them coherent.
Please note that **Encryption** and **Hashing** are not considered as
anonymization techniques, because they requires additional information
(a private key or a salt) and if this information is stolen, then the authentic
anonymization techniques, because they require additional information
(a private key or a salt), and if this information is stolen, then the authentic
data can be revealed.
Searching for Identifiers
===============================================================================
> WARNING : This is feature is at an early stage of development.
> WARNING : This feature is at an early stage of development.
As we've seen previously, this extension makes it very easy to
[declare the masking rules].
[declare masking rules].
[declare masking rules]: declare_masking_rules/
But of course when you're creating an anonymization strategy, the hard part is
to scan the database model to find which columns contains direct and indirect
identifiers and then decide how these identifiers should be masked.
However, when you create an anonymization strategy, the hard part is
scanning the database model to find which columns contain direct and indirect
identifiers, and then deciding how these identifiers should be masked.
The extension provides a `detect()` function that will search for common
identifiers names based on dictionary. For now, 2 dictionaries are available:
english ('en_US') and french ('fr_FR'). By default the english dictionary is
identifier names based on a dictionary. For now, 2 dictionaries are available:
English ('en_US') and French ('fr_FR'). By default, the English dictionary is
used:
```sql
......@@ -27,20 +27,20 @@ used:
customer | id | account_id | t
```
The identifiers categories are based on the [HIPAA classification].
The identifier categories are based on the [HIPAA classification].
[HIPAA classification]: https://www.luc.edu/its/aboutits/itspoliciesguidelines/hipaainformation/18hipaaidentifiers/
Limitations
---------------------------------------------------------------------------------
This is an heuristic method in the sense that it may report usefull information
This is a heuristic method in the sense that it may report useful information,
but it is based on a pragmatic approach that can lead to detection mistakes,
especially:
* `false positive`: a column is reported as an identifiers but it is not.
* `false negative`: a column contains identifiers but it is not reported
* `false positive`: a column is reported as an identifier, but it is not.
* `false negative`: a column contains identifiers, but it is not reported
The second one is of course more problematic. In any case, you should not
consider this function as an helping tool but aknowledge that you still need
The second one is of course more problematic. In any case, you should only
consider this function as a helping tool, and acknowledge that you still need
to review the entire database model in search of hidden identifiers.
Hide sensible data from a "masked" user
Hide sensitive data from a "masked" user
===============================================================================
You can hide some data from a role by declaring this role as a "MASKED".
You can hide some data from a role by declaring this role as a "MASKED" one.
Other roles will still access the original data.
**Example**:
......@@ -49,7 +49,8 @@ IS 'MASKED WITH FUNCTION anon.partial(phone,2,$$******$$,2)';
Step 4 : Connect with the masked user
```sql
=# \! psql peopledb -U skynet -c 'SELECT * FROM people;'
=# \c - skynet
=> SELECT * FROM people;
id | fistname | lastname | phone
----+----------+-----------+------------
T1 | Sarah | Stranahan | 06******11
......@@ -60,7 +61,7 @@ How to change the type of a masked column
------------------------------------------------------------------------------
When dynamic masking is activated, you are not allowed to change the datatype
on a column is there's a mask upon it.
of a column if there's a mask upon it.
To modify a masked column, you need to temporarily switch off the masking engine
like this:
......@@ -87,7 +88,7 @@ psql: ERROR: cannot drop table people because other objects depend on it
DETAIL: view mask.company depends on table people
```
To effectively remove the table, it is necessary to add the `CASCADE` options
To effectively remove the table, it is necessary to add the `CASCADE` option,
so that the masking view will be dropped too:
```sql
......@@ -109,10 +110,10 @@ SELECT start_dynamic_masking('sales');
```
**However** static masking with `anon.anonymize()` and anonymous export
with `anon.dump()` will work fine will multiple schemas.
with `anon.dump()` will work fine with multiple schemas.
### Performances
Dynamic Masking is now to be very slow with some queries, especially if you
trying to join 2 tables with a masked foreign key using hashing or
pseudonymisation.
Dynamic Masking is known to be very slow with some queries, especially if you
try to join 2 tables on a masked key using hashing or
pseudonymization.
......@@ -2,7 +2,7 @@ Generalization
===============================================================================
Reducing the accuracy of sensible data
Reducing the accuracy of sensitive data
--------------------------------------------------------------------------------
The idea of generalization is to replace data with a broader, less accurate
......@@ -13,7 +13,7 @@ the data remains true while avoiding the risk of re-identification.
Generalization is a way to achieve [k-anonymity].
PostgreSQL can handle generalization very easily with the [RANGE] data types,
a very powefull way to store and manipulate a set of values contained between
a very powerful way to store and manipulate a set of values contained between
a lower and an upper bound.
[k-anonymity]: #k-anonymity
......@@ -79,7 +79,7 @@ Generalization Functions
--------------------------------------------------------------------------------
PostgreSQL Anonymizer provides 6 generalization functions. One for each [RANGE]
type. Generally these functions take the original value as the first parameter
type. Generally these functions take the original value as the first parameter,
and a second parameter for the length of each step.
For numeric values :
......@@ -125,7 +125,7 @@ very far from the average salary. Therefore this person is probably the CEO
of the company.
With generalization, this is important because the size of the range (the "step")
must be wide enough to avoid identify one single individual.
must be wide enough to prevent the identification of one single individual.
[k-anonymity] is a way to assess this risk.
......@@ -136,7 +136,7 @@ By definition, with generalization the data remains true, but the column type
is changed.
This means that the transformation is not transparent, and therefore it cannot
be used for [dynamic masking].
be used with [dynamic masking].
[dynamic masking]: dynamic_masking/
......@@ -174,4 +174,4 @@ The higher the value, the better...
References
--------------------------------------------------------------------------------
* [How Google Anonymizes Data]: https://policies.google.com/technologies/anonymization
* [How Google Anonymizes Data](https://policies.google.com/technologies/anonymization)
......@@ -22,13 +22,13 @@ different ways :
In addition, various [Masking Functions] are available : randomization, faking,
partial scrambling, shufflin, noise or even your own custom function !
partial scrambling, shuffling, noise or even your own custom function!
Beyond masking, it is also possible to use a 4th approach called [Generalization]
Beyond masking, it is also possible to use a fourth approach called [Generalization]
which is perfect for statistics and data analytics.
Finally the extension offers a panel of [detection] functions that will try to
guess which columns needs to be anonymized.
Finally, the extension offers a panel of [detection] functions that will try to
guess which columns need to be anonymized.
[INSTALL.md]: INSTALL/
[Concepts]: concepts/
......@@ -39,6 +39,8 @@ guess which columns needs to be anonymized.
[Static Masking]: static_masking/
[Dynamic Masking]: dynamic_masking/
[Masking Functions]: masking_functions/
[Generalization]: generalization/
[detection]: detection/
......@@ -79,7 +81,8 @@ Step 3 : Declare the masking rules
Step 4 : Connect with the masked user
```sql
=# \! psql peopledb -U skynet -c 'SELECT * FROM people;'
=# \c - skynet
=> SELECT * FROM people;
id | fistname | lastname | phone
----+----------+-----------+------------
T1 | Sarah | Stranahan | 06******11
......@@ -89,9 +92,9 @@ Step 4 : Connect with the masked user
Warning
------------------------------------------------------------------------------
> *This is projet is at an early stage of development and should used carefully.*
> *This project is at an early stage of development, and should be used carefully.*
We need your feedback and ideas ! Let us know what you think of this tool,how it
We need your feedback and ideas ! Let us know what you think of this tool, how it
fits your needs and what features are missing.
You can either [open an issue] or send a message at <contact@dalibo.com>.
......
......@@ -22,6 +22,7 @@ The extension provides functions to implement 8 main anonymization strategies:
[Generic Hashing]: #generic-hashing
[Partial scrambling]: #partial-scrambling
[Generalization]: #generalization
[Shuffling]: /static_masking#shuffling
Depending on your data, you may need to use different strategies on different
columns :
......@@ -38,8 +39,8 @@ Destruction
First of all, the fastest and safest way to anonymize data is to destroy it
:-)
In many case, the best approach to hide the content of a column is to replace
all values with a single static value.
In many cases, the best approach to hide the content of a column is to replace
all the values with a single static value.
For instance, you can replace an entire column with the word 'CONFIDENTIAL' like
this:
......@@ -119,7 +120,7 @@ names, cities, etc. ).
If you want to use your own dataset, you can import custom CSV files with :
```sql
SELECT init('/path/to/custom_cvs_files/')
SELECT init('/path/to/custom_csv_files/')
```
Once the fake data is loaded, you have access to 12 faking functions:
......@@ -216,6 +217,13 @@ SECURITY LABEL FOR anon
IS 'MASKED WITH FUNCTION anon.pseudo_email(users.login) ';
```
**NOTE** : You may want to produce unique values using a pseudonymization
function. For instance, if you want to mask an `email` column that is declared
as `UNIQUE`. In this case, you will need to initialize the extension with a fake
dataset that is **way bigger** than the number of rows of the table. Otherwise you
may see some "collisions" happening, i.e. two different original values producing
the same pseudo value.
**WARNING** : Pseudonymization is often confused with anonymization but in fact
they serve 2 different purposes. With pseudonymization, the real data can be
rebuilt using the pseudo data, the masking rules and the seed. If an attacker
......@@ -226,7 +234,7 @@ dataset. The GDPR makes it very clear that personal data which have undergone
pseudonymization are still considered to be personal information (see [Recital 26]).
In a nutshell: pseudonymization may be useful in some use cases. But if your
goal is to escape from GDPR or similar data regulation, it is clearly a bad solution.
goal is to comply with GDPR or similar data regulation, it is clearly a bad solution.
[Recital 26]: https://www.privacy-regulation.eu/en/recital-26-GDPR.htm
......@@ -247,12 +255,13 @@ relatively unusual source data. Therefore, the
* `anon.hash(value)` will return a text hash of the value using a secret salt
and a secret hash algorithm (see below)
* `anon.digest(value,salt,algorithm)` lets choose a salt and the hash algorithm
you want to use
* `anon.digest(value,salt,algorithm)` lets you choose a salt, and a hash algorithm
from a pre-defined list
By default a random secret salt is generated when the extension is initialiazed
anf the default hash algortihm is `sha512`. You can change for the entire
database with to functions
By default, a random secret salt is generated when the extension is
initialized,
and the default hash algorithm is `sha512`. You can change these for the entire
database with two functions:
* `anon.set_secret_salt(value)` to define your own salt
* `anon.set_secret_algorithm(value)` to select another hash function.
......@@ -312,12 +321,12 @@ For instance : a credit card number can be replaced by '40XX XXXX XXXX XX96'.
Generalization
-------------------------------------------------------------------------------
Genelization is the principle of replace the original value by a range
Generalization is the principle of replacing the original value by a range
containing this value. For instance, instead of saying 'Paul is 42 years old',
you would can say 'Paul is between 40 and 50 years old.
you would say 'Paul is between 40 and 50 years old'.
> The generalization functions are a data type transformation. Therefore it is
> not possible to use them with the dynamic masking engine. Hower they are
> not possible to use them with the dynamic masking engine. However they are
> useful to create anonymized views. See example below
Let's imagine a table containing health information
......@@ -344,14 +353,14 @@ this:
```sql
CREATE VIEW anonymized_patient AS
SELECT
'REDACTED' AS name,
'REDACTED' AS lastname,
anon.generalize_int4range(zipcode,100) AS zipcode,
anon.generalize_tsrange(birth,'decade') AS birth
disease
FROM patients;
```
The anonymized table now look like that:
The anonymized table now looks like that:
```sql
SELECT * FROM anonymized_patient;
......@@ -370,7 +379,8 @@ SELECT * FROM anonymized_patient;
The generalized values are still useful for statistics because they remain
true but they are less accurante therefore reduce the risk of re-identification.
true, but they are less accurate, and therefore reduce the risk of
re-identification.
PostgreSQL offers several [RANGE] data types which are perfect for dates and
numeric values.
......@@ -381,7 +391,7 @@ For numeric values, 3 functions are available
* `generalize_int8range(value, step)`
* `generalize_numrange(value, step)`
...where `value` is the data the will be generalized, `step` is the size of
...where `value` is the data that will be generalized, and `step` is the size of
each range.
......@@ -391,7 +401,7 @@ each range.
Write your own Masks !
------------------------------------------------------------------------------
You can also use you own functions as a mask. The function must either be
You can also use your own function as a mask. The function must either be
destructive (like [Partial Scrambling]) or insert some randomness in the dataset
(like [Faking]).
......@@ -443,7 +453,7 @@ SELECT jsonb_pretty(info) FROM company WHERE business_name = 'Soylent Green';
```
Using the [PostgreSQL JSON functions and operators], you can walk
through the keys and replace the sensible values as needed.
through the keys and replace the sensitive values as needed.
[PostgreSQL JSON functions and operators]: https://www.postgresql.org/docs/current/functions-json.html
......@@ -504,9 +514,9 @@ And try it out !
(1 row)
```
This is just a quick and dirty example. As you can see manipulating a
sophiticated JSON structure with SQL is possible but it can be tricky at
This is just a quick and dirty example. As you can see, manipulating a
sophisticated JSON structure with SQL is possible, but it can be tricky at
first! There are multiple ways of walking through the keys and updating
values. You will probably have to try different approaches depending on
your real JSON data and the performance you want ot reach.
values. You will probably have to try different approaches, depending on
your real JSON data and the performance you want to reach.
......@@ -2,15 +2,15 @@ Permanently remove sensitive data
===============================================================================
Sometimes, it is useful to transform the original dataset directly. You can
do that in with different methods:
do that with different methods:
* [Applying masking Rules]
* [Shuffling a column]
* [Adding noise to a column]
* [Applying masking rules](#applying-masking-rules)
* [Shuffling a column](#shuffling)
* [Adding noise to a column](#adding-noise-to-a-column)
These methods will destroy the original data. Use with care.
Applying masking Rules
Applying masking rules
--------------------------------------------------------------------------------
You can permanently apply the [masking rules] of a database with
......@@ -110,8 +110,8 @@ Shuffling