feat: Support integer types other than BIGINT (#485)

edgarrmondragon · web-flow · commit 98cef125bfe8 · 2025-01-27T17:25:48.000-06:00
TODO:

- [x] Docs, specifically updating the `Data Types` section of the
readme, and how to override the schema to force the target into a
certain integer type
- [x] Tests
diff --git a/README.md b/README.md
@@ -238,51 +238,51 @@ develop your own Singer taps and targets.
 
 The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes.
 
-| jsonschema                     | Postgres                                |
-|--------------------------------|-----------------------------------------|
-| integer                        | bigint                                  |
-| UNSUPPORTED                    | bigserial                               |
-| UNSUPPORTED                    | bit [ (n) ]                             |
-| UNSUPPORTED                    | bit varying [ (n) ]                     |
-| boolean                        | boolean                                 |
-| UNSUPPORTED                    | box                                     |
+| jsonschema                                                                         | Postgres                                |
+| ---------------------------------------------------------------------------------- | --------------------------------------- |
+| integer                                                                            | bigint                                  |
+| integer with minimum >= 32768 or maximum < 32768                                   | smallint                                |
+| integer with minimum >= 2147483648 or maximum < 2147483648                         | integer                                 |
+| UNSUPPORTED                                                                        | bigserial                               |
+| UNSUPPORTED                                                                        | bit [ (n) ]                             |
+| UNSUPPORTED                                                                        | bit varying [ (n) ]                     |
+| boolean                                                                            | boolean                                 |
+| UNSUPPORTED                                                                        | box                                     |
 | string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea                                   |
-| UNSUPPORTED                    | character [ (n) ]                       |
-| UNSUPPORTED                    | character varying [ (n) ]               |
-| UNSUPPORTED                    | cidr                                    |
-| UNSUPPORTED                    | circle                                  |
-| string with format="date"      | date                                    |
-| UNSUPPORTED                    | double precision                        |
-| UNSUPPORTED                    | inet                                    |
-| UNSUPPORTED                    | integer                                 |
-| UNSUPPORTED                    | interval [ fields ] [ (p) ]             |
-| UNSUPPORTED                    | json                                    |
-| array; object                  | jsonb                                   |
-| UNSUPPORTED                    | line                                    |
-| UNSUPPORTED                    | lseg                                    |
-| UNSUPPORTED                    | macaddr                                 |
-| UNSUPPORTED                    | macaddr8                                |
-| UNSUPPORTED                    | money                                   |
-| number                         | numeric [ (p, s) ]                      |
-| UNSUPPORTED                    | path                                    |
-| UNSUPPORTED                    | pg_lsn                                  |
-| UNSUPPORTED                    | pg_snapshot                             |
-| UNSUPPORTED                    | point                                   |
-| UNSUPPORTED                    | polygon                                 |
-| UNSUPPORTED                    | real                                    |
-| UNSUPPORTED                    | smallint                                |
-| UNSUPPORTED                    | smallserial                             |
-| UNSUPPORTED                    | serial                                  |
-| string without format; untyped | text                                    |
-| string with format="time"      | time [ (p) ] [ without time zone ]      |
-| UNSUPPORTED                    | time [ (p) ] with time zone             |
-| string with format="date-time" | timestamp [ (p) ] [ without time zone ] |
-| UNSUPPORTED                    | timestamp [ (p) ] with time zone        |
-| UNSUPPORTED                    | tsquery                                 |
-| UNSUPPORTED                    | tsvector                                |
-| UNSUPPORTED                    | txid_snapshot                           |
-| string with format="uuid"      | uuid                                    |
-| UNSUPPORTED                    | xml                                     |
+| UNSUPPORTED                                                                        | character [ (n) ]                       |
+| UNSUPPORTED                                                                        | character varying [ (n) ]               |
+| UNSUPPORTED                                                                        | cidr                                    |
+| UNSUPPORTED                                                                        | circle                                  |
+| string with format="date"                                                          | date                                    |
+| UNSUPPORTED                                                                        | double precision                        |
+| UNSUPPORTED                                                                        | inet                                    |
+| UNSUPPORTED                                                                        | interval [ fields ] [ (p) ]             |
+| UNSUPPORTED                                                                        | json                                    |
+| array; object                                                                      | jsonb                                   |
+| UNSUPPORTED                                                                        | line                                    |
+| UNSUPPORTED                                                                        | lseg                                    |
+| UNSUPPORTED                                                                        | macaddr                                 |
+| UNSUPPORTED                                                                        | macaddr8                                |
+| UNSUPPORTED                                                                        | money                                   |
+| number                                                                             | numeric [ (p, s) ]                      |
+| UNSUPPORTED                                                                        | path                                    |
+| UNSUPPORTED                                                                        | pg_lsn                                  |
+| UNSUPPORTED                                                                        | pg_snapshot                             |
+| UNSUPPORTED                                                                        | point                                   |
+| UNSUPPORTED                                                                        | polygon                                 |
+| UNSUPPORTED                                                                        | real                                    |
+| UNSUPPORTED                                                                        | smallserial                             |
+| UNSUPPORTED                                                                        | serial                                  |
+| string without format; untyped                                                     | text                                    |
+| string with format="time"                                                          | time [ (p) ] [ without time zone ]      |
+| UNSUPPORTED                                                                        | time [ (p) ] with time zone             |
+| string with format="date-time"                                                     | timestamp [ (p) ] [ without time zone ] |
+| UNSUPPORTED                                                                        | timestamp [ (p) ] with time zone        |
+| UNSUPPORTED                                                                        | tsquery                                 |
+| UNSUPPORTED                                                                        | tsvector                                |
+| UNSUPPORTED                                                                        | txid_snapshot                           |
+| string with format="uuid"                                                          | uuid                                    |
+| UNSUPPORTED                                                                        | xml                                     |
 
 Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
 
@@ -298,9 +298,28 @@ If a column has multiple jsonschema types, the following order is using to order
 - DECIMAL
 - BIGINT
 - INTEGER
+- SMALLINT
 - BOOLEAN
 - NOTYPE
 
+### Using the Singer catalog to narrow down the Postgres data types
+
+You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap:
+
+```yaml
+# meltano.yml
+plugins:
+  extractors:
+  - name: tap-my-tap
+    schema:
+      some_stream_id:
+        my_column:
+          type: integer
+          # This will be mapped to 'smallint'
+          minimum: 0
+          maximum: 1000
+```
+
 ## Content Encoding Support
 
 Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types.
diff --git a/target_postgres/connector.py b/target_postgres/connector.py
@@ -5,6 +5,7 @@
 import atexit
 import io
 import itertools
+import math
 import signal
 import sys
 import typing as t
@@ -18,7 +19,14 @@
 import sqlalchemy as sa
 from singer_sdk import SQLConnector
 from singer_sdk.connectors.sql import JSONSchemaToSQL
-from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID
+from sqlalchemy.dialects.postgresql import (
+    ARRAY,
+    BIGINT,
+    BYTEA,
+    JSONB,
+    SMALLINT,
+    UUID,
+)
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.url import make_url
 from sqlalchemy.types import (
@@ -273,6 +281,17 @@ def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB:
         # Case 3: tuples
         return ARRAY(JSONB()) if isinstance(items, list) else JSONB()
 
+    def _handle_integer_type(self, jsonschema: dict) -> SMALLINT | INTEGER | BIGINT:
+        """Handle integer type."""
+        minimum = jsonschema.get("minimum", -math.inf)
+        maximum = jsonschema.get("maximum", math.inf)
+        if minimum >= -(2**15) and maximum < 2**15:
+            return SMALLINT()
+        if minimum >= -(2**31) and maximum < 2**31:
+            return INTEGER()
+
+        return BIGINT()
+
     @cached_property
     def jsonschema_to_sql(self) -> JSONSchemaToSQL:
         """Return a JSONSchemaToSQL instance with custom type handling."""
@@ -281,7 +300,7 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL:
             max_varchar_length=self.max_varchar_length,
         )
         to_sql.fallback_type = TEXT
-        to_sql.register_type_handler("integer", BIGINT)
+        to_sql.register_type_handler("integer", self._handle_integer_type)
         to_sql.register_type_handler("object", JSONB)
         to_sql.register_type_handler("array", self._handle_array_type)
         to_sql.register_format_handler("date-time", TIMESTAMP)
@@ -386,6 +405,7 @@ def pick_best_sql_type(sql_type_array: list):
             DECIMAL,
             BIGINT,
             INTEGER,
+            SMALLINT,
             BOOLEAN,
             NOTYPE,
         ]
diff --git a/target_postgres/tests/test_types.py b/target_postgres/tests/test_types.py
@@ -2,8 +2,9 @@
 
 import pytest
 import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import BIGINT, SMALLINT
 
-from target_postgres.connector import NOTYPE, PostgresConnector
+from target_postgres.connector import NOTYPE, JSONSchemaToPostgres, PostgresConnector
 
 
 @pytest.fixture
@@ -36,3 +37,70 @@ def connector():
 def test_type_hierarchy(connector, types, expected):
     """Test that types are merged correctly."""
     assert type(connector.merge_sql_types(types)) is expected
+
+
+class TestJSONSchemaToPostgres:
+    """Test JSONSchemaToPostgres class."""
+
+    @pytest.fixture
+    def to_postgres(self, connector: PostgresConnector):
+        """Create a JSONSchemaToPostgres instance."""
+        return connector.jsonschema_to_sql
+
+    def test_datetime_string(self, to_postgres: JSONSchemaToPostgres):
+        """Test conversion of JSON schema string to Postgres datetime."""
+        result = to_postgres.to_sql_type({"type": "string", "format": "date-time"})
+        assert type(result) is sa.TIMESTAMP
+
+    @pytest.mark.parametrize(
+        ("jsonschema", "expected"),
+        [
+            pytest.param({"type": "integer"}, BIGINT, id="default"),
+            pytest.param({"type": ["integer", "null"]}, BIGINT, id="default-nullable"),
+            pytest.param(
+                {
+                    "type": "integer",
+                    "minimum": 0,
+                    "maximum": 2**15 - 1,
+                },
+                SMALLINT,
+                id="smallint",
+            ),
+            pytest.param(
+                {
+                    "type": "integer",
+                    "minimum": -5,
+                    "maximum": 5,
+                },
+                SMALLINT,
+                id="negative-smallint",
+            ),
+            pytest.param(
+                {
+                    "type": "integer",
+                    "minimum": 0,
+                    "maximum": 2**31 - 1,
+                },
+                sa.INTEGER,
+                id="integer",
+            ),
+            pytest.param(
+                {
+                    "type": "integer",
+                    "minimum": 0,
+                    "maximum": 2**31 + 1,
+                },
+                BIGINT,
+                id="bigint",
+            ),
+        ],
+    )
+    def test_integers(
+        self,
+        to_postgres: JSONSchemaToPostgres,
+        jsonschema: dict,
+        expected: type[sa.types.TypeEngine],
+    ):
+        """Test conversion of JSON schema types to Postgres types."""
+        result = to_postgres.to_sql_type(jsonschema)
+        assert type(result) is expected