
Commit 01eda08

test: Add test cases for arrays and objects
In PostgreSQL, everything boils down to the `jsonb[]` type, but arrays are reflected as `sqlalchemy.dialects.postgresql.ARRAY` rather than `sqlalchemy.dialects.postgresql.JSONB`. To prepare for more advanced type mangling & validation, and to better support databases that claim compatibility with PostgreSQL, the new test cases exercise arrays with different kinds of inner values, because on other databases ARRAYs may need to have uniform content. Along the way, the commit adds a `verify_schema` utility function in the spirit of the `verify_data` function, refactored and generalized from the `test_anyof` test case.
1 parent fb9a3bd commit 01eda08
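
For illustration, a minimal sketch of the reflection behavior described above, assuming a reachable PostgreSQL instance behind a hypothetical DSN and a throwaway `demo` table (none of this is part of the commit):

import sqlalchemy
from sqlalchemy.dialects.postgresql import ARRAY, JSONB

# Hypothetical DSN; adjust for your environment.
engine = sqlalchemy.create_engine("postgresql://postgres:postgres@localhost/demo")
with engine.begin() as connection:
    # Storage type is `jsonb[]`, as described in the commit message.
    connection.execute(
        sqlalchemy.text("CREATE TABLE IF NOT EXISTS demo (id BIGINT, value JSONB[])")
    )

# Reflection reports the column as ARRAY (of JSONB), not as JSONB itself.
table = sqlalchemy.Table("demo", sqlalchemy.MetaData(), autoload_with=engine)
assert isinstance(table.c.value.type, ARRAY)
assert not isinstance(table.c.value.type, JSONB)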

File tree

8 files changed: +186 −35 lines changed
target_postgres/tests/data_files/array_boolean.singer

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+{"type": "SCHEMA", "stream": "array_boolean", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "boolean"}}}}}
+{"type": "RECORD", "stream": "array_boolean", "record": {"id": 1, "value": [ true, false ]}}
+{"type": "RECORD", "stream": "array_boolean", "record": {"id": 2, "value": [ false ]}}
+{"type": "RECORD", "stream": "array_boolean", "record": {"id": 3, "value": [ false, true, true, false ]}}
+{"type": "STATE", "value": {"array_boolean": 3}}
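These files are Singer message streams: the SCHEMA message declares the stream and its JSON Schema, RECORD messages carry the rows, and the trailing STATE message acts as a checkpoint. A hedged sketch of replaying one of them from Python, assuming the package's standard `target-postgres` CLI entry point and a hypothetical `config.json`:

import subprocess

# Equivalent to `cat array_boolean.singer | target-postgres --config config.json`.
with open("target_postgres/tests/data_files/array_boolean.singer", "rb") as messages:
    subprocess.run(
        ["target-postgres", "--config", "config.json"],  # hypothetical config file
        stdin=messages,
        check=True,
    )
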
target_postgres/tests/data_files/array_data.singer

Lines changed: 0 additions & 6 deletions
This file was deleted.
target_postgres/tests/data_files/array_float_vector.singer

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "sql": {"type": "vector", "dim": 4}}}}}
+{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.0, 2.0, 1.0, 1.0 ]}}
+{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.0 ]}}
+{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.0, 1.0, 1.0 ]}}
+{"type": "STATE", "value": {"array_float_vector": 3}}
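
The non-standard `"sql": {"type": "vector", "dim": 4}` annotation in the schema above hints at a pgvector-backed column, which is why the corresponding test is skipped with "Needs pgvector support". A hedged sketch of how a target might honor the annotation, assuming the third-party `pgvector` package and a hypothetical `resolve_array_type` helper (neither is wired up by this commit):

import sqlalchemy
from pgvector.sqlalchemy import Vector  # assumption: the `pgvector` package is installed
from sqlalchemy.dialects.postgresql import ARRAY

def resolve_array_type(property_schema: dict) -> sqlalchemy.types.TypeEngine:
    """Hypothetical helper: choose a column type for a JSON Schema array property."""
    sql_hint = property_schema.get("sql", {})
    if sql_hint.get("type") == "vector":
        # Map the annotation to a pgvector column, e.g. VECTOR(4) for the file above.
        return Vector(sql_hint["dim"])
    return ARRAY(sqlalchemy.types.NUMERIC())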
target_postgres/tests/data_files/array_number.singer

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+{"type": "SCHEMA", "stream": "array_number", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}}}}}
+{"type": "RECORD", "stream": "array_number", "record": {"id": 1, "value": [ 42.42, 84.84, 23 ]}}
+{"type": "RECORD", "stream": "array_number", "record": {"id": 2, "value": [ 1.0 ]}}
+{"type": "RECORD", "stream": "array_number", "record": {"id": 3, "value": [ 1.11, 2.22, 3, 4, 5.55 ]}}
+{"type": "STATE", "value": {"array_number": 3}}
target_postgres/tests/data_files/array_string.singer

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+{"type": "SCHEMA", "stream": "array_string", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "string"}}}}}
+{"type": "RECORD", "stream": "array_string", "record": {"id": 1, "value": [ "apple", "orange", "pear" ]}}
+{"type": "RECORD", "stream": "array_string", "record": {"id": 2, "value": [ "banana", "apple" ]}}
+{"type": "RECORD", "stream": "array_string", "record": {"id": 3, "value": [ "pear" ]}}
+{"type": "RECORD", "stream": "array_string", "record": {"id": 4, "value": [ "orange", "banana", "apple", "pear" ]}}
+{"type": "STATE", "value": {"array_string": 4}}
target_postgres/tests/data_files/array_timestamp.singer

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+{"type": "SCHEMA", "stream": "array_timestamp", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "string", "format": "date-time"}}}}}
+{"type": "RECORD", "stream": "array_timestamp", "record": {"id": 1, "value": [ "2023-12-13T01:15:02", "2023-12-13T01:16:02" ]}}
+{"type": "RECORD", "stream": "array_timestamp", "record": {"id": 2, "value": [ "2023-12-13T01:15:02" ]}}
+{"type": "RECORD", "stream": "array_timestamp", "record": {"id": 3, "value": [ "2023-12-13T01:15:02", "2023-12-13T01:16:02", "2023-12-13T01:17:02" ]}}
+{"type": "STATE", "value": {"array_timestamp": 3}}
target_postgres/tests/data_files/object_mixed.singer

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+{"type": "SCHEMA", "stream": "object_mixed", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "object"}}}}
+{"type": "RECORD", "stream": "object_mixed", "record": {"id": 1, "value": {"string": "foo", "integer": 42, "float": 42.42, "timestamp": "2023-12-13T01:15:02", "array_boolean": [true, false], "array_float": [42.42, 84.84], "array_integer": [42, 84], "array_string": ["foo", "bar"], "nested_object": {"foo": "bar"}}}}
+{"type": "STATE", "value": {"object_mixed": 1}}
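
Once loaded, the whole `value` object lands in a single JSONB column (as the new `test_object_mixed` below asserts), so nested fields stay queryable in SQL. A minimal sketch, assuming the records above were already loaded into an `object_mixed` table reachable behind a hypothetical DSN:

import sqlalchemy

# Hypothetical DSN; adjust for your environment.
engine = sqlalchemy.create_engine("postgresql://postgres:postgres@localhost/demo")
with engine.connect() as connection:
    # `->>` extracts a JSONB field as text.
    result = connection.execute(
        sqlalchemy.text("SELECT value ->> 'string' FROM object_mixed WHERE id = 1")
    )
    assert result.scalar_one() == "foo"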

target_postgres/tests/test_target_postgres.py

Lines changed: 157 additions & 29 deletions
@@ -11,8 +11,8 @@
 import sqlalchemy
 from singer_sdk.exceptions import MissingKeyPropertiesError
 from singer_sdk.testing import get_target_test_class, sync_end_to_end
-from sqlalchemy.dialects.postgresql import ARRAY
-from sqlalchemy.types import TEXT, TIMESTAMP
+from sqlalchemy.dialects.postgresql import ARRAY, JSONB
+from sqlalchemy.types import BIGINT, TEXT, TIMESTAMP
 
 from target_postgres.connector import PostgresConnector
 from target_postgres.target import TargetPostgres
@@ -94,7 +94,7 @@ def verify_data(
 
     Args:
         target: The target to obtain a database connection from.
-        full_table_name: The schema and table name of the table to check data for.
+        table_name: The schema and table name of the table to check data for.
         primary_key: The primary key of the table.
        number_of_rows: The expected number of rows that should be in the table.
        check_data: A dictionary representing the full contents of the first row in the
@@ -134,6 +134,43 @@ def verify_data(
         assert result.first()[0] == number_of_rows
 
 
+def verify_schema(
+    target: TargetPostgres,
+    table_name: str,
+    check_columns: dict,
+):
+    """Check whether the schema of a database table matches the provided column definitions.
+
+    Args:
+        target: The target to obtain a database connection from.
+        table_name: The schema and table name of the table to check data for.
+        check_columns: A dictionary mapping column names to their definitions. Currently,
+            it is all about the `type` attribute which is compared.
+    """
+    engine = create_engine(target)
+    schema = target.config["default_target_schema"]
+    with engine.connect() as connection:
+        meta = sqlalchemy.MetaData()
+        table = sqlalchemy.Table(
+            table_name, meta, schema=schema, autoload_with=connection
+        )
+        for column in table.c:
+            # Ignore `_sdc` columns for now.
+            if column.name.startswith("_sdc"):
+                continue
+            try:
+                column_type_expected = check_columns[column.name]["type"]
+            except KeyError:
+                raise ValueError(
+                    f"Invalid check_columns - missing definition for column: {column.name}"
+                )
+            if not isinstance(column.type, column_type_expected):
+                raise TypeError(
+                    f"Column '{column.name}' (with type '{column.type}') "
+                    f"does not match expected type: {column_type_expected}"
+                )
+
+
 def test_sqlalchemy_url_config(postgres_config_no_ssl):
     """Be sure that passing a sqlalchemy_url works
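
A short sketch of the helper's failure mode, assuming the fixtures and helpers from this module (a hypothetical extra test, not part of this commit): an expected type that does not match the reflected column raises TypeError.

import pytest
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.types import TEXT

def test_verify_schema_mismatch(postgres_target):
    singer_file_to_target("array_boolean.singer", postgres_target)
    # `id` is created as BIGINT, so expecting TEXT must fail loudly.
    with pytest.raises(TypeError, match="does not match expected type"):
        verify_schema(
            postgres_target,
            "array_boolean",
            check_columns={"id": {"type": TEXT}, "value": {"type": ARRAY}},
        )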
@@ -406,11 +443,111 @@ def test_duplicate_records(postgres_target):
     verify_data(postgres_target, "test_duplicate_records", 2, "id", row)
 
 
-def test_array_data(postgres_target):
-    file_name = "array_data.singer"
+def test_array_boolean(postgres_target):
+    file_name = "array_boolean.singer"
+    singer_file_to_target(file_name, postgres_target)
+    row = {"id": 1, "value": [True, False]}
+    verify_data(postgres_target, "array_boolean", 3, "id", row)
+    verify_schema(
+        postgres_target,
+        "array_boolean",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": ARRAY},
+        },
+    )
+
+
+@pytest.mark.skip("Needs pgvector support")
+def test_array_float_vector(postgres_target):
+    file_name = "array_float_vector.singer"
+    singer_file_to_target(file_name, postgres_target)
+    row = {
+        "id": 1,
+        "value": [Decimal("1.0"), Decimal("2.0"), Decimal("1.0"), Decimal("1.0")],
+    }
+    verify_data(postgres_target, "array_float_vector", 3, "id", row)
+    verify_schema(
+        postgres_target,
+        "array_float_vector",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": ARRAY},
+        },
+    )
+
+
+def test_array_number(postgres_target):
+    file_name = "array_number.singer"
+    singer_file_to_target(file_name, postgres_target)
+    row = {"id": 1, "value": [Decimal("42.42"), Decimal("84.84"), 23]}
+    verify_data(postgres_target, "array_number", 3, "id", row)
+    verify_schema(
+        postgres_target,
+        "array_number",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": ARRAY},
+        },
+    )
+
+
+def test_array_string(postgres_target):
+    file_name = "array_string.singer"
+    singer_file_to_target(file_name, postgres_target)
+    row = {"id": 1, "value": ["apple", "orange", "pear"]}
+    verify_data(postgres_target, "array_string", 4, "id", row)
+    verify_schema(
+        postgres_target,
+        "array_string",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": ARRAY},
+        },
+    )
+
+
+def test_array_timestamp(postgres_target):
+    file_name = "array_timestamp.singer"
     singer_file_to_target(file_name, postgres_target)
-    row = {"id": 1, "fruits": ["apple", "orange", "pear"]}
-    verify_data(postgres_target, "test_carts", 4, "id", row)
+    row = {"id": 1, "value": ["2023-12-13T01:15:02", "2023-12-13T01:16:02"]}
+    verify_data(postgres_target, "array_timestamp", 3, "id", row)
+    verify_schema(
+        postgres_target,
+        "array_timestamp",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": ARRAY},
+        },
+    )
+
+
+def test_object_mixed(postgres_target):
+    file_name = "object_mixed.singer"
+    singer_file_to_target(file_name, postgres_target)
+    row = {
+        "id": 1,
+        "value": {
+            "string": "foo",
+            "integer": 42,
+            "float": Decimal("42.42"),
+            "timestamp": "2023-12-13T01:15:02",
+            "array_boolean": [True, False],
+            "array_float": [Decimal("42.42"), Decimal("84.84")],
+            "array_integer": [42, 84],
+            "array_string": ["foo", "bar"],
+            "nested_object": {"foo": "bar"},
+        },
+    }
+    verify_data(postgres_target, "object_mixed", 1, "id", row)
+    verify_schema(
+        postgres_target,
+        "object_mixed",
+        check_columns={
+            "id": {"type": BIGINT},
+            "value": {"type": JSONB},
+        },
+    )
 
 
 def test_encoded_string_data(postgres_target):
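
Note the Decimal values in the expected rows above: the underlying driver (psycopg2's default behavior) returns PostgreSQL NUMERIC values as decimal.Decimal, and constructing the expectations from strings avoids binary-float artifacts. A quick self-contained illustration:

from decimal import Decimal

# Constructing from a float drags in binary-float noise; strings stay exact.
assert Decimal(42.42) != Decimal("42.42")
assert str(Decimal("42.42")) == "42.42"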
@@ -456,41 +593,32 @@ def test_large_int(postgres_target):
 
 def test_anyof(postgres_target):
     """Test that anyOf is handled correctly"""
-    engine = create_engine(postgres_target)
     table_name = "commits"
     file_name = f"{table_name}.singer"
-    schema = postgres_target.config["default_target_schema"]
     singer_file_to_target(file_name, postgres_target)
-    with engine.connect() as connection:
-        meta = sqlalchemy.MetaData()
-        table = sqlalchemy.Table(
-            "commits", meta, schema=schema, autoload_with=connection
-        )
-        for column in table.c:
-            # {"type":"string"}
-            if column.name == "id":
-                assert isinstance(column.type, TEXT)
 
+    verify_schema(
+        postgres_target,
+        table_name,
+        check_columns={
+            # {"type":"string"}
+            "id": {"type": TEXT},
             # Any of nullable date-time.
             # Note that postgres timestamp is equivalent to jsonschema date-time.
             # {"anyOf":[{"type":"string","format":"date-time"},{"type":"null"}]}
-            if column.name in {"authored_date", "committed_date"}:
-                assert isinstance(column.type, TIMESTAMP)
-
+            "authored_date": {"type": TIMESTAMP},
+            "committed_date": {"type": TIMESTAMP},
             # Any of nullable array of strings or single string.
             # {"anyOf":[{"type":"array","items":{"type":["null","string"]}},{"type":"string"},{"type":"null"}]}
-            if column.name == "parent_ids":
-                assert isinstance(column.type, ARRAY)
-
+            "parent_ids": {"type": ARRAY},
             # Any of nullable string.
             # {"anyOf":[{"type":"string"},{"type":"null"}]}
-            if column.name == "commit_message":
-                assert isinstance(column.type, TEXT)
-
+            "commit_message": {"type": TEXT},
             # Any of nullable string or integer.
             # {"anyOf":[{"type":"string"},{"type":"integer"},{"type":"null"}]}
-            if column.name == "legacy_id":
-                assert isinstance(column.type, TEXT)
+            "legacy_id": {"type": TEXT},
+        },
+    )
 
 
 def test_new_array_column(postgres_target):
