From a23ceb2510444c78ee7904038f011b7c797cb925 Mon Sep 17 00:00:00 2001 From: SpaceCondor Date: Wed, 18 Sep 2024 10:20:56 -0400 Subject: [PATCH 1/3] Add sanitize_null_text_characters option --- README.md | 57 ++++++++++++++++++------------------ target_postgres/connector.py | 9 ++++++ target_postgres/sinks.py | 35 ++++++++++++++++++++-- target_postgres/target.py | 10 +++++++ 4 files changed, 80 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 176e3620..89420061 100644 --- a/README.md +++ b/README.md @@ -20,34 +20,35 @@ Built with the [Meltano SDK](https://sdk.meltano.com) for Singer Taps and Target ## Settings -| Setting | Required | Default | Description | -| :------------------------------ | :------- | :---------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| host | False | None | Hostname for postgres instance. Note if sqlalchemy_url is set this will be ignored. | -| port | False | 5432 | The port on which postgres is awaiting connection. Note if sqlalchemy_url is set this will be ignored. | -| user | False | None | User name used to authenticate. Note if sqlalchemy_url is set this will be ignored. | -| password | False | None | Password used to authenticate. Note if sqlalchemy_url is set this will be ignored. | -| database | False | None | Database name. Note if sqlalchemy_url is set this will be ignored. | -| sqlalchemy_url | False | None | SQLAlchemy connection string. This will override using host, user, password, port, dialect, and all ssl settings. Note that you must escape password special characters properly. 
See https://docs.sqlalchemy.org/en/20/core/engines.html#escaping-special-characters-such-as-signs-in-passwords | -| dialect+driver | False | postgresql+psycopg2 | Dialect+driver see https://docs.sqlalchemy.org/en/20/core/engines.html. Generally just leave this alone. Note if sqlalchemy_url is set this will be ignored. | -| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | -| activate_version | False | 1 | If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. | -| hard_delete | False | 0 | When activate version is sent from a tap this specefies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false. | -| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work!This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | -| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | -| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. Note if sqlalchemy_url is set this will be ignored. | -| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. 
To use SSL to verify the server's identity, use ssl_enable instead. Note if sqlalchemy_url is set this will be ignored. | -| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. Note if sqlalchemy_url is set this will be ignored. | -| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | -| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | -| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | -| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. 
| -| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | -| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details | -| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | -| ssh_tunnel.username | False | None | Username to connect to bastion host | -| ssh_tunnel.port | False | 22 | Port to connect to bastion host | -| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | -| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | +| Setting | Required | Default | Description | +|:----------------------------------| :------- | :---------------------------- |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | False | None | Hostname for postgres instance. Note if sqlalchemy_url is set this will be ignored. | +| port | False | 5432 | The port on which postgres is awaiting connection. Note if sqlalchemy_url is set this will be ignored. | +| user | False | None | User name used to authenticate. Note if sqlalchemy_url is set this will be ignored. | +| password | False | None | Password used to authenticate. Note if sqlalchemy_url is set this will be ignored. | +| database | False | None | Database name. Note if sqlalchemy_url is set this will be ignored. | +| sqlalchemy_url | False | None | SQLAlchemy connection string. This will override using host, user, password, port, dialect, and all ssl settings. Note that you must escape password special characters properly. 
See https://docs.sqlalchemy.org/en/20/core/engines.html#escaping-special-characters-such-as-signs-in-passwords | +| dialect+driver | False | postgresql+psycopg2 | Dialect+driver see https://docs.sqlalchemy.org/en/20/core/engines.html. Generally just leave this alone. Note if sqlalchemy_url is set this will be ignored. | +| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | +| activate_version | False | 1 | If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. | +| hard_delete | False | 0 | When activate version is sent from a tap this specifies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false. | +| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work! This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | +| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | +| sanitize_null_text_characters | False | 0 | If set to true, the target will sanitize null characters in char/text/varchar fields, as they are not supported by Postgres. See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) for more information about chr(0) not being supported. | +| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. 
To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. Note if sqlalchemy_url is set this will be ignored. | +| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. Note if sqlalchemy_url is set this will be ignored. | +| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. Note if sqlalchemy_url is set this will be ignored. | +| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | +| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | +| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored. | +| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. 
| +| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | +| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details | +| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | +| ssh_tunnel.username | False | None | Username to connect to bastion host | +| ssh_tunnel.port | False | 22 | Port to connect to bastion host | +| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | +| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | A full list of supported settings and capabilities is available by running: `target-postgres --about` diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 547055b2..aad6b728 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -95,6 +95,15 @@ def interpret_content_encoding(self) -> bool: """ return self.config.get("interpret_content_encoding", False) + @cached_property + def sanitize_null_text_characters(self) -> bool: + """Whether to sanitize null text characters. + + Returns: + True if the feature is enabled, False otherwise. 
+ """ + return self.config.get("sanitize_null_text_characters", False) + def prepare_table( # type: ignore[override] self, full_table_name: str | FullyQualifiedName, diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py index 39332b63..1f40fe38 100644 --- a/target_postgres/sinks.py +++ b/target_postgres/sinks.py @@ -12,6 +12,7 @@ from sqlalchemy.sql.expression import bindparam from target_postgres.connector import PostgresConnector +from target_postgres.tests.test_types import connector if t.TYPE_CHECKING: from singer_sdk.connectors.sql import FullyQualifiedName @@ -121,6 +122,28 @@ def generate_temp_table_name(self): # in postgres, used a guid just in case we are using the same session return f"{str(uuid.uuid4()).replace('-', '_')}" + def sanitize_null_text_characters(self, data): + """Sanitizes null characters by replacing \u0000 with \uFFFD""" + + def replace_null_character(d): + return d.replace('\u0000', '\uFFFD') + + if isinstance(data, str): + data = replace_null_character(data) + + elif isinstance(data, dict): + for k in data: + if isinstance(data[k], str): + data[k] = replace_null_character(data[k]) + + elif isinstance(data, list): + for i in range(0, len(data)): + if isinstance(data[i], str): + data[i] = replace_null_character(data[i]) + + return data + + def bulk_insert_records( # type: ignore[override] self, table: sa.Table, @@ -163,9 +186,12 @@ def bulk_insert_records( # type: ignore[override] for record in records: insert_record = {} for column in columns: - insert_record[column.name] = record.get(column.name) + if self.connector.sanitize_null_text_characters: + insert_record[column.name] = self.sanitize_null_text_characters(record.get(column.name)) + else: + insert_record[column.name] = record.get(column.name) # No need to check for a KeyError here because the SDK already - # guaruntees that all key properties exist in the record. + # guarantees that all key properties exist in the record. 
primary_key_value = "".join([str(record[key]) for key in primary_keys]) insert_records[primary_key_value] = insert_record data_to_insert = list(insert_records.values()) @@ -173,7 +199,10 @@ def bulk_insert_records( # type: ignore[override] for record in records: insert_record = {} for column in columns: - insert_record[column.name] = record.get(column.name) + if self.connector.sanitize_null_text_characters: + insert_record[column.name] = self.sanitize_null_text_characters(record.get(column.name)) + else: + insert_record[column.name] = record.get(column.name) data_to_insert.append(insert_record) connection.execute(insert, data_to_insert) return True diff --git a/target_postgres/target.py b/target_postgres/target.py index 809dbd86..baa2be62 100644 --- a/target_postgres/target.py +++ b/target_postgres/target.py @@ -203,6 +203,16 @@ def __init__( "in an error if the data is not encoded as expected." ), ), + th.Property( + "sanitize_null_text_characters", + th.BooleanType, + default=False, + description=( + "If set to true, the target will sanitize null characters in char/text/varchar fields, as they " + "are not supported by Postgres. See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) " + "for more information about chr(0) not being supported." 
+ ), + ), th.Property( "ssl_enable", th.BooleanType, From 496e292bc45124ac3e1e8539b35b18e608e40be0 Mon Sep 17 00:00:00 2001 From: SpaceCondor Date: Wed, 18 Sep 2024 13:35:32 -0400 Subject: [PATCH 2/3] Remove test import --- target_postgres/sinks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py index 1f40fe38..6a34b09a 100644 --- a/target_postgres/sinks.py +++ b/target_postgres/sinks.py @@ -12,7 +12,6 @@ from sqlalchemy.sql.expression import bindparam from target_postgres.connector import PostgresConnector -from target_postgres.tests.test_types import connector if t.TYPE_CHECKING: from singer_sdk.connectors.sql import FullyQualifiedName From eece403a97eab9870ea9b2c498d037aff73069e6 Mon Sep 17 00:00:00 2001 From: SpaceCondor Date: Wed, 18 Sep 2024 13:46:37 -0400 Subject: [PATCH 3/3] Minor reformatting --- target_postgres/sinks.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py index 6a34b09a..5a299799 100644 --- a/target_postgres/sinks.py +++ b/target_postgres/sinks.py @@ -122,10 +122,10 @@ def generate_temp_table_name(self): return f"{str(uuid.uuid4()).replace('-', '_')}" def sanitize_null_text_characters(self, data): - """Sanitizes null characters by replacing \u0000 with \uFFFD""" + """Sanitizes null characters by replacing \u0000 with \ufffd""" def replace_null_character(d): - return d.replace('\u0000', '\uFFFD') + return d.replace("\u0000", "\ufffd") if isinstance(data, str): data = replace_null_character(data) @@ -142,7 +142,6 @@ def replace_null_character(d): return data - def bulk_insert_records( # type: ignore[override] self, table: sa.Table, @@ -186,7 +185,9 @@ def bulk_insert_records( # type: ignore[override] insert_record = {} for column in columns: if self.connector.sanitize_null_text_characters: - insert_record[column.name] = self.sanitize_null_text_characters(record.get(column.name)) + insert_record[column.name] = 
self.sanitize_null_text_characters( + record.get(column.name) + ) else: insert_record[column.name] = record.get(column.name) # No need to check for a KeyError here because the SDK already @@ -199,7 +200,9 @@ def bulk_insert_records( # type: ignore[override] insert_record = {} for column in columns: if self.connector.sanitize_null_text_characters: - insert_record[column.name] = self.sanitize_null_text_characters(record.get(column.name)) + insert_record[column.name] = self.sanitize_null_text_characters( + record.get(column.name) + ) else: insert_record[column.name] = record.get(column.name) data_to_insert.append(insert_record)