diff --git a/README.md b/README.md index 5cbb6e20..bb807ccc 100644 --- a/README.md +++ b/README.md @@ -302,6 +302,18 @@ If a column has multiple jsonschema types, the following order is using to order - BOOLEAN - NOTYPE +### `x-sql-datatype` extension + +This target supports the [`x-sql-datatype` extension](https://sdk.meltano.com/en/latest/guides/sql-target.html#use-the-x-sql-datatype-json-schema-extension) to the JSON schema. This extension allows you to specify the Postgres data type that should be used for a given field. This can be useful when the default mapping is not what you want. + + + +| `x-sql-datatype` | Postgres | Description | +| :--------------- | :------- | :----------------------------------------------------------------- | +| smallint | smallint | small-range integer (-32768 to +32767) | +| integer | integer | typical choice for integer (-2147483648 to +2147483647) | +| bigint | bigint | large-range integer (-9223372036854775808 to +9223372036854775807) | + ### Using the Singer catalog to narrow down the Postgres data types You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap: @@ -320,6 +332,20 @@ plugins: maximum: 1000 ``` +Or to use the `x-sql-datatype` extension: + +```yaml +# meltano.yml +plugins: + extractors: + - name: tap-my-tap + schema: + some_stream_id: + my_column: + type: integer + x-sql-datatype: smallint +``` + ## Content Encoding Support Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types. diff --git a/meltano.yml b/meltano.yml index 4d77006b..593bef80 100644 --- a/meltano.yml +++ b/meltano.yml @@ -5,7 +5,7 @@ project_id: target-postgres plugins: extractors: - name: tap-smoke-test - namespace: tap_smoke_test + variant: meltano pip_url: git+https://github.com/meltano/tap-smoke-test.git executable: tap-smoke-test config: @@ -19,6 +19,11 @@ plugins: __key_properties__: [id] page_views: __key_properties__: [vistor_id] + schema: + animals: + views: + type: integer + x-sql-datatype: smallint - name: tap-github variant: meltanolabs pip_url: git+https://github.com/MeltanoLabs/tap-github.git diff --git a/plugins/extractors/tap-github--meltanolabs.lock b/plugins/extractors/tap-github--meltanolabs.lock index 98544871..6357c1aa 100644 --- a/plugins/extractors/tap-github--meltanolabs.lock +++ b/plugins/extractors/tap-github--meltanolabs.lock @@ -6,11 +6,12 @@ "label": "GitHub", "docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs", "repo": "https://github.com/MeltanoLabs/tap-github", - "pip_url": "git+https://github.com/MeltanoLabs/tap-github.git", + "pip_url": "meltanolabs-tap-github", "description": "Code hosting platform", "logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png", "capabilities": [ "about", + "batch", "catalog", "discover", "schema-flattening", @@ -41,22 +42,90 @@ "label": "Additional Auth Tokens", "description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits." }, + { + "name": "auth_app_keys", + "kind": "array", + "label": "Auth App Keys", + "description": "List of GitHub App credentials to authenticate with. Each credential can be constructed by combining an App ID and App private key into the format `:app_id:;;-----BEGIN RSA PRIVATE KEY----- _YOUR_P_KEY_ -----END RSA PRIVATE KEY-----`." + }, { "name": "auth_token", - "kind": "password", + "kind": "string", "label": "Auth Token", - "description": "GitHub token to authenticate with." + "description": "GitHub token to authenticate with.", + "sensitive": true + }, + { + "name": "batch_config.encoding.compression", + "kind": "options", + "label": "Batch Compression Format", + "description": "Compression format to use for batch files.", + "options": [ + { + "label": "GZIP", + "value": "gzip" + }, + { + "label": "None", + "value": "none" + } + ] + }, + { + "name": "batch_config.encoding.format", + "kind": "options", + "label": "Batch Encoding Format", + "description": "Format to use for batch files.", + "options": [ + { + "label": "JSONL", + "value": "jsonl" + }, + { + "label": "Parquet", + "value": "parquet" + } + ] + }, + { + "name": "batch_config.storage.prefix", + "kind": "string", + "label": "Batch Storage Prefix", + "description": "Prefix to use when writing batch files." + }, + { + "name": "batch_config.storage.root", + "kind": "string", + "label": "Batch Storage Root", + "description": "Root path to use when writing batch files." + }, + { + "name": "expiry_time_buffer", + "kind": "integer", + "label": "Expiry Time Buffer" + }, + { + "name": "faker_config.locale", + "kind": "array", + "label": "Faker Locale", + "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" + }, + { + "name": "faker_config.seed", + "kind": "string", + "label": "Faker Seed", + "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" }, { "name": "flattening_enabled", "kind": "boolean", - "label": "Flattening Enabled", + "label": "Enable Schema Flattening", "description": "'True' to enable schema flattening and automatically expand nested properties." }, { "name": "flattening_max_depth", "kind": "integer", - "label": "Flattening Max Depth", + "label": "Max Flattening Depth", "description": "The max depth to flatten schemas." }, { @@ -110,6 +179,27 @@ "kind": "object", "label": "Stream Maps" }, + { + "name": "stream_options.milestones.state", + "kind": "options", + "value": "open", + "label": "Stream Options Milestones State", + "description": "Configures which states are of interest. Must be one of [open, closed, all], defaults to open.", + "options": [ + { + "label": "Open", + "value": "open" + }, + { + "label": "Closed", + "value": "closed" + }, + { + "label": "All", + "value": "all" + } + ] + }, { "name": "user_agent", "kind": "string", diff --git a/plugins/extractors/tap-smoke-test--meltano.lock b/plugins/extractors/tap-smoke-test--meltano.lock new file mode 100644 index 00000000..3e5ec107 --- /dev/null +++ b/plugins/extractors/tap-smoke-test--meltano.lock @@ -0,0 +1,122 @@ +{ + "plugin_type": "extractors", + "name": "tap-smoke-test", + "namespace": "tap_smoke_test", + "variant": "meltano", + "label": "Smoke Test", + "docs": "https://hub.meltano.com/extractors/tap-smoke-test--meltano", + "repo": "https://github.com/meltano/tap-smoke-test", + "pip_url": "git+https://github.com/meltano/tap-smoke-test.git", + "executable": "tap-smoke-test", + "description": "Generates sample data to be used for testing.", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/smoke-test.png", + "capabilities": [ + "about", + "batch", + "catalog", + "discover", + "schema-flattening", + "state", + "stream-maps" + ], + "settings_group_validation": [ + [ + "streams" + ] + ], + "settings": [ + { + "name": "batch_config.encoding.compression", + "kind": "options", + "label": "Batch Compression Format", + "description": "Compression format to use for batch files.", + "options": [ + { + "label": "GZIP", + "value": "gzip" + }, + { + "label": "None", + "value": "none" + } + ] + }, + { + "name": "batch_config.encoding.format", + "kind": "options", + "label": "Batch Encoding Format", + "description": "Format to use for batch files.", + "options": [ + { + "label": "JSONL", + "value": "jsonl" + }, + { + "label": "Parquet", + "value": "parquet" + } + ] + }, + { + "name": "batch_config.storage.prefix", + "kind": "string", + "label": "Batch Storage Prefix", + "description": "Prefix to use when writing batch files." + }, + { + "name": "batch_config.storage.root", + "kind": "string", + "label": "Batch Storage Root", + "description": "Root path to use when writing batch files." + }, + { + "name": "faker_config.locale", + "kind": "array", + "label": "Faker Locale", + "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" + }, + { + "name": "faker_config.seed", + "kind": "string", + "label": "Faker Seed", + "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Enable Schema Flattening", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Max Flattening Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "schema_inference_record_count", + "kind": "integer", + "value": 5, + "label": "Schema Inference Record Count", + "description": "How many records of the source data should be used for schema inference/construction." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "User Stream Map Configuration", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." + }, + { + "name": "streams", + "kind": "array", + "label": "Streams", + "description": "An array of objects containing:\n* `stream_name`: The name of the stream.\n* `input_filename`: Path to a jsonl file containing records to use for mock data.\n* `client_exception`: (Default False) Whether we should simulate failing by having the client raise an exception.\n* `schema_gen_exception`: (Default False) Whether we should simulate failing by raising an exception during schema inference.\n* `loop_count`: (Default 1) The number of times we should playback the input file.\n\nFor example:\n\n```yaml\nstreams:\n- stream_name: animals\n input_filename: https://raw.githubusercontent.com/meltano/tap-smoke-test/main/demo-data/animals-data.jsonl\n```\n" + } + ] +} diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 228a3f0a..75bfa6f6 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -310,6 +310,9 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL: to_sql.register_format_handler("hostname", TEXT) to_sql.register_format_handler("ipv4", TEXT) to_sql.register_format_handler("ipv6", TEXT) + to_sql.register_sql_datatype_handler("smallint", SMALLINT) + to_sql.register_sql_datatype_handler("integer", INTEGER) + to_sql.register_sql_datatype_handler("bigint", BIGINT) return to_sql def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine: diff --git a/target_postgres/tests/test_types.py b/target_postgres/tests/test_types.py index 065ca327..436aa8d5 100644 --- a/target_postgres/tests/test_types.py +++ b/target_postgres/tests/test_types.py @@ -93,6 +93,14 @@ def test_datetime_string(self, to_postgres: JSONSchemaToPostgres): BIGINT, id="bigint", ), + pytest.param( + { + "type": "integer", + "x-sql-datatype": "smallint", + }, + SMALLINT, + id="x-sql-datatype-smallint", + ), ], ) def test_integers(