Skip to content

Commit 98cef12

Browse files
feat: Support integer types other than BIGINT (#485)
TODO: - [x] Docs, specifically updating the `Data Types` section of the readme, and how to override the schema to force the target into a certain integer type - [x] Tests
1 parent f77971c commit 98cef12

File tree

3 files changed

+154
-47
lines changed

3 files changed

+154
-47
lines changed

README.md

Lines changed: 63 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -238,51 +238,51 @@ develop your own Singer taps and targets.
238238

239239
The table below shows how this target maps jsonschema datatypes to Postgres datatypes.
240240

241-
| jsonschema | Postgres |
242-
|--------------------------------|-----------------------------------------|
243-
| integer | bigint |
244-
| UNSUPPORTED | bigserial |
245-
| UNSUPPORTED | bit [ (n) ] |
246-
| UNSUPPORTED | bit varying [ (n) ] |
247-
| boolean | boolean |
248-
| UNSUPPORTED | box |
241+
| jsonschema | Postgres |
242+
| ---------------------------------------------------------------------------------- | --------------------------------------- |
243+
| integer | bigint |
244+
| integer with minimum >= -32768 and maximum < 32768 | smallint |
245+
| integer with minimum >= -2147483648 and maximum < 2147483648 | integer |
246+
| UNSUPPORTED | bigserial |
247+
| UNSUPPORTED | bit [ (n) ] |
248+
| UNSUPPORTED | bit varying [ (n) ] |
249+
| boolean | boolean |
250+
| UNSUPPORTED | box |
249251
| string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea |
250-
| UNSUPPORTED | character [ (n) ] |
251-
| UNSUPPORTED | character varying [ (n) ] |
252-
| UNSUPPORTED | cidr |
253-
| UNSUPPORTED | circle |
254-
| string with format="date" | date |
255-
| UNSUPPORTED | double precision |
256-
| UNSUPPORTED | inet |
257-
| UNSUPPORTED | integer |
258-
| UNSUPPORTED | interval [ fields ] [ (p) ] |
259-
| UNSUPPORTED | json |
260-
| array; object | jsonb |
261-
| UNSUPPORTED | line |
262-
| UNSUPPORTED | lseg |
263-
| UNSUPPORTED | macaddr |
264-
| UNSUPPORTED | macaddr8 |
265-
| UNSUPPORTED | money |
266-
| number | numeric [ (p, s) ] |
267-
| UNSUPPORTED | path |
268-
| UNSUPPORTED | pg_lsn |
269-
| UNSUPPORTED | pg_snapshot |
270-
| UNSUPPORTED | point |
271-
| UNSUPPORTED | polygon |
272-
| UNSUPPORTED | real |
273-
| UNSUPPORTED | smallint |
274-
| UNSUPPORTED | smallserial |
275-
| UNSUPPORTED | serial |
276-
| string without format; untyped | text |
277-
| string with format="time" | time [ (p) ] [ without time zone ] |
278-
| UNSUPPORTED | time [ (p) ] with time zone |
279-
| string with format="date-time" | timestamp [ (p) ] [ without time zone ] |
280-
| UNSUPPORTED | timestamp [ (p) ] with time zone |
281-
| UNSUPPORTED | tsquery |
282-
| UNSUPPORTED | tsvector |
283-
| UNSUPPORTED | txid_snapshot |
284-
| string with format="uuid" | uuid |
285-
| UNSUPPORTED | xml |
252+
| UNSUPPORTED | character [ (n) ] |
253+
| UNSUPPORTED | character varying [ (n) ] |
254+
| UNSUPPORTED | cidr |
255+
| UNSUPPORTED | circle |
256+
| string with format="date" | date |
257+
| UNSUPPORTED | double precision |
258+
| UNSUPPORTED | inet |
259+
| UNSUPPORTED | interval [ fields ] [ (p) ] |
260+
| UNSUPPORTED | json |
261+
| array; object | jsonb |
262+
| UNSUPPORTED | line |
263+
| UNSUPPORTED | lseg |
264+
| UNSUPPORTED | macaddr |
265+
| UNSUPPORTED | macaddr8 |
266+
| UNSUPPORTED | money |
267+
| number | numeric [ (p, s) ] |
268+
| UNSUPPORTED | path |
269+
| UNSUPPORTED | pg_lsn |
270+
| UNSUPPORTED | pg_snapshot |
271+
| UNSUPPORTED | point |
272+
| UNSUPPORTED | polygon |
273+
| UNSUPPORTED | real |
274+
| UNSUPPORTED | smallserial |
275+
| UNSUPPORTED | serial |
276+
| string without format; untyped | text |
277+
| string with format="time" | time [ (p) ] [ without time zone ] |
278+
| UNSUPPORTED | time [ (p) ] with time zone |
279+
| string with format="date-time" | timestamp [ (p) ] [ without time zone ] |
280+
| UNSUPPORTED | timestamp [ (p) ] with time zone |
281+
| UNSUPPORTED | tsquery |
282+
| UNSUPPORTED | tsvector |
283+
| UNSUPPORTED | txid_snapshot |
284+
| string with format="uuid" | uuid |
285+
| UNSUPPORTED | xml |
286286

287287
Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
288288

@@ -298,9 +298,28 @@ If a column has multiple jsonschema types, the following order is using to order
298298
- DECIMAL
299299
- BIGINT
300300
- INTEGER
301+
- SMALLINT
301302
- BOOLEAN
302303
- NOTYPE
303304

305+
### Using the Singer catalog to narrow down the Postgres data types
306+
307+
You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap:
308+
309+
```yaml
310+
# meltano.yml
311+
plugins:
312+
extractors:
313+
- name: tap-my-tap
314+
schema:
315+
some_stream_id:
316+
my_column:
317+
type: integer
318+
# This will be mapped to 'smallint'
319+
minimum: 0
320+
maximum: 1000
321+
```
322+
304323
## Content Encoding Support
305324

306325
JSON Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types.

target_postgres/connector.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import atexit
66
import io
77
import itertools
8+
import math
89
import signal
910
import sys
1011
import typing as t
@@ -18,7 +19,14 @@
1819
import sqlalchemy as sa
1920
from singer_sdk import SQLConnector
2021
from singer_sdk.connectors.sql import JSONSchemaToSQL
21-
from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID
22+
from sqlalchemy.dialects.postgresql import (
23+
ARRAY,
24+
BIGINT,
25+
BYTEA,
26+
JSONB,
27+
SMALLINT,
28+
UUID,
29+
)
2230
from sqlalchemy.engine import URL
2331
from sqlalchemy.engine.url import make_url
2432
from sqlalchemy.types import (
@@ -273,6 +281,17 @@ def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB:
273281
# Case 3: tuples
274282
return ARRAY(JSONB()) if isinstance(items, list) else JSONB()
275283

284+
def _handle_integer_type(self, jsonschema: dict) -> SMALLINT | INTEGER | BIGINT:
285+
"""Handle integer type."""
286+
minimum = jsonschema.get("minimum", -math.inf)
287+
maximum = jsonschema.get("maximum", math.inf)
288+
if minimum >= -(2**15) and maximum < 2**15:
289+
return SMALLINT()
290+
if minimum >= -(2**31) and maximum < 2**31:
291+
return INTEGER()
292+
293+
return BIGINT()
294+
276295
@cached_property
277296
def jsonschema_to_sql(self) -> JSONSchemaToSQL:
278297
"""Return a JSONSchemaToSQL instance with custom type handling."""
@@ -281,7 +300,7 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL:
281300
max_varchar_length=self.max_varchar_length,
282301
)
283302
to_sql.fallback_type = TEXT
284-
to_sql.register_type_handler("integer", BIGINT)
303+
to_sql.register_type_handler("integer", self._handle_integer_type)
285304
to_sql.register_type_handler("object", JSONB)
286305
to_sql.register_type_handler("array", self._handle_array_type)
287306
to_sql.register_format_handler("date-time", TIMESTAMP)
@@ -386,6 +405,7 @@ def pick_best_sql_type(sql_type_array: list):
386405
DECIMAL,
387406
BIGINT,
388407
INTEGER,
408+
SMALLINT,
389409
BOOLEAN,
390410
NOTYPE,
391411
]

target_postgres/tests/test_types.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import pytest
44
import sqlalchemy as sa
5+
from sqlalchemy.dialects.postgresql import BIGINT, SMALLINT
56

6-
from target_postgres.connector import NOTYPE, PostgresConnector
7+
from target_postgres.connector import NOTYPE, JSONSchemaToPostgres, PostgresConnector
78

89

910
@pytest.fixture
@@ -36,3 +37,70 @@ def connector():
3637
def test_type_hierarchy(connector, types, expected):
3738
"""Test that types are merged correctly."""
3839
assert type(connector.merge_sql_types(types)) is expected
40+
41+
42+
class TestJSONSchemaToPostgres:
43+
"""Test JSONSchemaToPostgres class."""
44+
45+
@pytest.fixture
46+
def to_postgres(self, connector: PostgresConnector):
47+
"""Create a JSONSchemaToPostgres instance."""
48+
return connector.jsonschema_to_sql
49+
50+
def test_datetime_string(self, to_postgres: JSONSchemaToPostgres):
51+
"""Test conversion of JSON schema string to Postgres datetime."""
52+
result = to_postgres.to_sql_type({"type": "string", "format": "date-time"})
53+
assert type(result) is sa.TIMESTAMP
54+
55+
@pytest.mark.parametrize(
56+
("jsonschema", "expected"),
57+
[
58+
pytest.param({"type": "integer"}, BIGINT, id="default"),
59+
pytest.param({"type": ["integer", "null"]}, BIGINT, id="default-nullable"),
60+
pytest.param(
61+
{
62+
"type": "integer",
63+
"minimum": 0,
64+
"maximum": 2**15 - 1,
65+
},
66+
SMALLINT,
67+
id="smallint",
68+
),
69+
pytest.param(
70+
{
71+
"type": "integer",
72+
"minimum": -5,
73+
"maximum": 5,
74+
},
75+
SMALLINT,
76+
id="negative-smallint",
77+
),
78+
pytest.param(
79+
{
80+
"type": "integer",
81+
"minimum": 0,
82+
"maximum": 2**31 - 1,
83+
},
84+
sa.INTEGER,
85+
id="integer",
86+
),
87+
pytest.param(
88+
{
89+
"type": "integer",
90+
"minimum": 0,
91+
"maximum": 2**31 + 1,
92+
},
93+
BIGINT,
94+
id="bigint",
95+
),
96+
],
97+
)
98+
def test_integers(
99+
self,
100+
to_postgres: JSONSchemaToPostgres,
101+
jsonschema: dict,
102+
expected: type[sa.types.TypeEngine],
103+
):
104+
"""Test conversion of JSON schema types to Postgres types."""
105+
result = to_postgres.to_sql_type(jsonschema)
106+
assert type(result) is expected

0 commit comments

Comments
 (0)