2929DISTINCT_VALUE_LIMIT = 10
3030
3131NODE_PROPERTIES_QUERY = (
32- "CALL apoc.meta.data() "
32+ "CALL apoc.meta.data({sample: $SAMPLE} ) "
3333 "YIELD label, other, elementType, type, property "
3434 "WHERE NOT type = 'RELATIONSHIP' AND elementType = 'node' "
3535 "AND NOT label IN $EXCLUDED_LABELS "
3838)
3939
4040REL_PROPERTIES_QUERY = (
41- "CALL apoc.meta.data() "
41+ "CALL apoc.meta.data({sample: $SAMPLE} ) "
4242 "YIELD label, other, elementType, type, property "
4343 "WHERE NOT type = 'RELATIONSHIP' AND elementType = 'relationship' "
4444 "AND NOT label in $EXCLUDED_LABELS "
4747)
4848
4949REL_QUERY = (
50- "CALL apoc.meta.data() "
50+ "CALL apoc.meta.data({sample: $SAMPLE} ) "
5151 "YIELD label, other, elementType, type, property "
5252 "WHERE type = 'RELATIONSHIP' AND elementType = 'node' "
5353 "UNWIND other AS other_node "
@@ -186,6 +186,7 @@ def get_schema(
186186 database : Optional [str ] = None ,
187187 timeout : Optional [float ] = None ,
188188 sanitize : bool = False ,
189+ sample : int = 1000 ,
189190) -> str :
190191 """
191192 Returns the schema of the graph as a string with following format:
@@ -210,6 +211,8 @@ def get_schema(
210211 sanitize (bool): A flag to indicate whether to remove lists with
211212 more than 128 elements from results. Useful for removing
212213 embedding-like properties from database responses. Default is False.
214+ sample (int): Number of nodes to sample when calling the apoc.meta.data procedure.
215+ Setting sample to -1 disables sampling. Defaults to 1000.
213216
214217
215218 Returns:
@@ -221,6 +224,7 @@ def get_schema(
221224 database = database ,
222225 timeout = timeout ,
223226 sanitize = sanitize ,
227+ sample = sample ,
224228 )
225229 return format_schema (structured_schema , is_enhanced )
226230
@@ -231,6 +235,7 @@ def get_structured_schema(
231235 database : Optional [str ] = None ,
232236 timeout : Optional [float ] = None ,
233237 sanitize : bool = False ,
238+ sample : int = 1000 ,
234239) -> dict [str , Any ]:
235240 """
236241 Returns the structured schema of the graph.
@@ -280,6 +285,8 @@ def get_structured_schema(
280285 sanitize (bool): A flag to indicate whether to remove lists with
281286 more than 128 elements from results. Useful for removing
282287 embedding-like properties from database responses. Default is False.
288+ sample (int): Number of nodes to sample when calling the apoc.meta.data procedure.
289+ Setting sample to -1 disables sampling. Defaults to 1000.
283290
284291 Returns:
285292 dict[str, Any]: the graph schema information in a structured format.
@@ -291,7 +298,8 @@ def get_structured_schema(
291298 query = NODE_PROPERTIES_QUERY ,
292299 params = {
293300 "EXCLUDED_LABELS" : EXCLUDED_LABELS
294- + [BASE_ENTITY_LABEL , BASE_KG_BUILDER_LABEL ]
301+ + [BASE_ENTITY_LABEL , BASE_KG_BUILDER_LABEL ],
302+ "SAMPLE" : sample ,
295303 },
296304 database = database ,
297305 timeout = timeout ,
@@ -304,7 +312,7 @@ def get_structured_schema(
304312 for data in query_database (
305313 driver = driver ,
306314 query = REL_PROPERTIES_QUERY ,
307- params = {"EXCLUDED_LABELS" : EXCLUDED_RELS },
315+ params = {"EXCLUDED_LABELS" : EXCLUDED_RELS , "SAMPLE" : sample },
308316 database = database ,
309317 timeout = timeout ,
310318 sanitize = sanitize ,
@@ -318,7 +326,8 @@ def get_structured_schema(
318326 query = REL_QUERY ,
319327 params = {
320328 "EXCLUDED_LABELS" : EXCLUDED_LABELS
321- + [BASE_ENTITY_LABEL , BASE_KG_BUILDER_LABEL ]
329+ + [BASE_ENTITY_LABEL , BASE_KG_BUILDER_LABEL ],
330+ "SAMPLE" : sample ,
322331 },
323332 database = database ,
324333 timeout = timeout ,
0 commit comments