
Commit 522b8b9

jsl-llama.cpp upgrade python side
1 parent fb9c8e7 commit 522b8b9

File tree

4 files changed: +120 -84 lines changed


python/sparknlp/annotator/seq2seq/auto_gguf_model.py

Lines changed: 3 additions & 1 deletion
@@ -253,7 +253,9 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFMo
             nCtx=4096,
             nBatch=512,
             embedding=False,
-            nPredict=100
+            nPredict=100,
+            nGpuLayers=99,
+            systemPrompt="You are a helpful assistant."
         )

     @staticmethod
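With these defaults, a freshly loaded AutoGGUFModel now offloads up to 99 layers to the GPU and carries a generic system prompt. A minimal sketch of overriding both at load time, assuming a pretrained GGUF model is available and that a setSystemPrompt setter exists alongside the setNGpuLayers setter shown later in this commit (setter names other than setNGpuLayers are an assumption, not part of this diff):

from sparknlp.annotator import AutoGGUFModel

# Placeholder pretrained model; any GGUF chat model would do.
auto_gguf = (
    AutoGGUFModel.pretrained()          # picks up nGpuLayers=99 and the default system prompt
    .setInputCols(["document"])
    .setOutputCol("completions")
    .setNGpuLayers(20)                  # lower the offload when VRAM is limited
    .setSystemPrompt("You are a terse assistant.")  # assumed setter for the new systemPrompt default
)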

python/sparknlp/common/properties.py

Lines changed: 100 additions & 66 deletions
@@ -765,14 +765,14 @@ class HasLlamaCppProperties:
     # -------- MODEl PARAMETERS --------
     nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation",
                      typeConverter=TypeConverters.toInt)
-    nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
-                          typeConverter=TypeConverters.toInt)
+    # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
+    #                       typeConverter=TypeConverters.toInt)
     nThreadsBatch = Param(Params._dummy(), "nThreadsBatch",
                           "Set the number of threads to use during batch and prompt processing",
                           typeConverter=TypeConverters.toInt)
-    nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
-                               "Set the number of threads to use during batch and prompt processing",
-                               typeConverter=TypeConverters.toInt)
+    # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
+    #                            "Set the number of threads to use during batch and prompt processing",
+    #                            typeConverter=TypeConverters.toInt)
     nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt)
     nBatch = Param(Params._dummy(), "nBatch",
                    "Set the logical batch size for prompt processing (must be >=32 to use BLAS)",
@@ -782,12 +782,12 @@ class HasLlamaCppProperties:
                     typeConverter=TypeConverters.toInt)
     nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding",
                    typeConverter=TypeConverters.toInt)
-    nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
-                    typeConverter=TypeConverters.toInt)
-    nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
-                       typeConverter=TypeConverters.toInt)
-    pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
-                   typeConverter=TypeConverters.toFloat)
+    # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
+    #                 typeConverter=TypeConverters.toInt)
+    # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
+    #                    typeConverter=TypeConverters.toInt)
+    # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
+    #                typeConverter=TypeConverters.toFloat)
     nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)",
                        typeConverter=TypeConverters.toInt)
     nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft",
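With the draft-thread parameters commented out, thread tuning now goes through the surviving setters only, and speculative decoding keeps just the draft-token count plus the draft model defined further down (nChunks, nSequences and pSplit are dropped). A minimal sketch, assuming an annotator that mixes in HasLlamaCppProperties such as AutoGGUFModel, and assuming a setModelDraft setter accompanies the surviving modelDraft param; the GGUF path is a placeholder:

model = (
    AutoGGUFModel.pretrained()
    .setNThreads(8)                            # generation threads
    .setNThreadsBatch(8)                       # batch/prompt-processing threads
    .setNCtx(4096)                             # prompt context size
    .setNBatch(512)                            # logical batch size, >= 32 to use BLAS
    .setNDraft(16)                             # tokens drafted per speculative step
    .setModelDraft("/models/tiny-draft.gguf")  # assumed setter, placeholder path
)
# setNThreadsDraft / setNThreadsBatchDraft would now fail: those params are commented out.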
@@ -802,10 +802,10 @@ class HasLlamaCppProperties:
                             typeConverter=TypeConverters.toString)
     mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.",
                     typeConverter=TypeConverters.toInt)
-    tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
-                        typeConverter=TypeConverters.toListFloat)
-    grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
-    grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
+    # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
+    #                     typeConverter=TypeConverters.toListFloat)
+    # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
+    # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
     ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling",
                          typeConverter=TypeConverters.toFloat)
     ropeFreqScale = Param(Params._dummy(), "ropeFreqScale",
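Multi-GPU placement is reduced to layer offload and main-GPU selection; tensorSplit and the group-attention knobs go away. A minimal sketch using only setters that remain in this file:

gpu_model = (
    AutoGGUFModel.pretrained()
    .setNGpuLayers(99)   # store as many layers in VRAM as possible
    .setMainGpu(0)       # GPU used for scratch and small tensors
)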
@@ -837,7 +837,7 @@ class HasLlamaCppProperties:
                              typeConverter=TypeConverters.toString)
     # Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
     #
-    # - UNSPECIFIED: Don't use any scaling
+    # - NONE: Don't use any scaling
     # - LINEAR: Linear scaling
     # - YARN: YaRN RoPE scaling
     ropeScalingType = Param(Params._dummy(), "ropeScalingType",
@@ -848,26 +848,28 @@ class HasLlamaCppProperties:
     # - 0 NONE: Don't use any pooling
     # - 1 MEAN: Mean Pooling
     # - 2 CLS: CLS Pooling
+    # - 3 LAST: Last token pooling
+    # - 4 RANK: For reranked models
     poolingType = Param(Params._dummy(), "poolingType",
                         "Set the pooling type for embeddings, use model default if unspecified",
                         typeConverter=TypeConverters.toString)
     modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding",
                        typeConverter=TypeConverters.toString)
     modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString)
-    lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
-                                      "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
-                                      typeConverter=TypeConverters.toString)
-    lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
-                                       "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
-                                       typeConverter=TypeConverters.toString)
+    # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
+    #                                   "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
+    #                                   typeConverter=TypeConverters.toString)
+    # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
+    #                                    "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+    #                                    typeConverter=TypeConverters.toString)
     # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters")
     embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support",
                       typeConverter=TypeConverters.toBoolean)
     flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention",
                            typeConverter=TypeConverters.toBoolean)
-    inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
-                           "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
-                           typeConverter=TypeConverters.toBoolean)
+    # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
+    #                        "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
+    #                        typeConverter=TypeConverters.toBoolean)
     useMmap = Param(Params._dummy(), "useMmap",
                     "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)",
                     typeConverter=TypeConverters.toBoolean)
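Of the loading flags in this block, embedding, flashAttention and useMmap survive, while inputPrefixBos and the lookup-cache paths are commented out. A short sketch of the surviving flags:

emb_model = (
    AutoGGUFModel.pretrained()
    .setEmbedding(True)       # load the model with embedding support
    .setFlashAttention(True)  # enable Flash Attention
    .setUseMmap(True)         # memory-map the model for faster load
)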
@@ -948,17 +950,17 @@ def setNThreads(self, nThreads: int):
         """Set the number of threads to use during generation"""
         return self._set(nThreads=nThreads)

-    def setNThreadsDraft(self, nThreadsDraft: int):
-        """Set the number of threads to use during draft generation"""
-        return self._set(nThreadsDraft=nThreadsDraft)
+    # def setNThreadsDraft(self, nThreadsDraft: int):
+    #     """Set the number of threads to use during draft generation"""
+    #     return self._set(nThreadsDraft=nThreadsDraft)

     def setNThreadsBatch(self, nThreadsBatch: int):
         """Set the number of threads to use during batch and prompt processing"""
         return self._set(nThreadsBatch=nThreadsBatch)

-    def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
-        """Set the number of threads to use during batch and prompt processing"""
-        return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
+    # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
+    #     """Set the number of threads to use during batch and prompt processing"""
+    #     return self._set(nThreadsBatchDraft=nThreadsBatchDraft)

     def setNCtx(self, nCtx: int):
         """Set the size of the prompt context"""
@@ -976,17 +978,17 @@ def setNDraft(self, nDraft: int):
         """Set the number of tokens to draft for speculative decoding"""
         return self._set(nDraft=nDraft)

-    def setNChunks(self, nChunks: int):
-        """Set the maximal number of chunks to process"""
-        return self._set(nChunks=nChunks)
+    # def setNChunks(self, nChunks: int):
+    #     """Set the maximal number of chunks to process"""
+    #     return self._set(nChunks=nChunks)

-    def setNSequences(self, nSequences: int):
-        """Set the number of sequences to decode"""
-        return self._set(nSequences=nSequences)
+    # def setNSequences(self, nSequences: int):
+    #     """Set the number of sequences to decode"""
+    #     return self._set(nSequences=nSequences)

-    def setPSplit(self, pSplit: float):
-        """Set the speculative decoding split probability"""
-        return self._set(pSplit=pSplit)
+    # def setPSplit(self, pSplit: float):
+    #     """Set the speculative decoding split probability"""
+    #     return self._set(pSplit=pSplit)

     def setNGpuLayers(self, nGpuLayers: int):
         """Set the number of layers to store in VRAM (-1 - use default)"""
@@ -1004,17 +1006,17 @@ def setMainGpu(self, mainGpu: int):
         """Set the main GPU that is used for scratch and small tensors."""
         return self._set(mainGpu=mainGpu)

-    def setTensorSplit(self, tensorSplit: List[float]):
-        """Set how split tensors should be distributed across GPUs"""
-        return self._set(tensorSplit=tensorSplit)
+    # def setTensorSplit(self, tensorSplit: List[float]):
+    #     """Set how split tensors should be distributed across GPUs"""
+    #     return self._set(tensorSplit=tensorSplit)

-    def setGrpAttnN(self, grpAttnN: int):
-        """Set the group-attention factor"""
-        return self._set(grpAttnN=grpAttnN)
+    # def setGrpAttnN(self, grpAttnN: int):
+    #     """Set the group-attention factor"""
+    #     return self._set(grpAttnN=grpAttnN)

-    def setGrpAttnW(self, grpAttnW: int):
-        """Set the group-attention width"""
-        return self._set(grpAttnW=grpAttnW)
+    # def setGrpAttnW(self, grpAttnW: int):
+    #     """Set the group-attention width"""
+    #     return self._set(grpAttnW=grpAttnW)

     def setRopeFreqBase(self, ropeFreqBase: float):
         """Set the RoPE base frequency, used by NTK-aware scaling"""
@@ -1049,7 +1051,16 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float):
         return self._set(defragmentationThreshold=defragmentationThreshold)

     def setNumaStrategy(self, numaStrategy: str):
-        """Set optimization strategies that help on some NUMA systems (if available)"""
+        """Set optimization strategies that help on some NUMA systems (if available)
+
+        Possible values:
+
+        - DISABLED: No NUMA optimizations
+        - DISTRIBUTE: spread execution evenly over all
+        - ISOLATE: only spawn threads on CPUs on the node that execution started on
+        - NUMA_CTL: use the CPU map provided by numactl
+        - MIRROR: Mirrors the model across NUMA nodes
+        """
         numaUpper = numaStrategy.upper()
         numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"]
         if numaUpper not in numaStrategies:
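The expanded docstring now matches the validation below it: the value is uppercased and checked against the five strategies before being stored. A quick sketch of the behaviour, assuming `model` is any annotator with these properties:

model.setNumaStrategy("distribute")       # accepted; compared as "DISTRIBUTE"

try:
    model.setNumaStrategy("interleave")   # not in the list above
except ValueError as err:
    print(err)                            # validation rejects unknown strategies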
@@ -1060,13 +1071,36 @@ def setNumaStrategy(self, numaStrategy: str):
         return self._set(numaStrategy=numaStrategy)

     def setRopeScalingType(self, ropeScalingType: str):
-        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
-        return self._set(ropeScalingType=ropeScalingType)
+        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
+
+        Possible values:
+
+        - NONE: Don't use any scaling
+        - LINEAR: Linear scaling
+        - YARN: YaRN RoPE scaling
+        """
+        ropeScalingTypeUpper = ropeScalingType.upper()
+        ropeScalingTypes = ["NONE", "LINEAR", "YARN"]
+        if ropeScalingTypeUpper not in ropeScalingTypes:
+            raise ValueError(
+                f"Invalid RoPE scaling type: {ropeScalingType}. "
+                + f"Valid values are: {ropeScalingTypes}"
+            )
+        return self._set(ropeScalingType=ropeScalingTypeUpper)

     def setPoolingType(self, poolingType: str):
-        """Set the pooling type for embeddings, use model default if unspecified"""
+        """Set the pooling type for embeddings, use model default if unspecified
+
+        Possible values:
+
+        - 0 NONE: Don't use any pooling
+        - 1 MEAN: Mean Pooling
+        - 2 CLS: CLS Pooling
+        - 3 LAST: Last token pooling
+        - 4 RANK: For reranked models
+        """
         poolingTypeUpper = poolingType.upper()
-        poolingTypes = ["NONE", "MEAN", "CLS", "LAST"]
+        poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"]
         if poolingTypeUpper not in poolingTypes:
             raise ValueError(
                 f"Invalid pooling type: {poolingType}. "
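Both setters now validate their input case-insensitively; setRopeScalingType additionally stores the uppercased value, and setPoolingType accepts the new RANK option. A short sketch of the resulting behaviour, again assuming `model` is an annotator with these properties:

model.setRopeScalingType("yarn")   # stored as "YARN"
model.setPoolingType("rank")       # newly accepted, for reranked models

try:
    model.setRopeScalingType("ntk")
except ValueError as err:
    print(err)   # Invalid RoPE scaling type: ntk. Valid values are: ['NONE', 'LINEAR', 'YARN']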
@@ -1082,13 +1116,13 @@ def setModelAlias(self, modelAlias: str):
         """Set a model alias"""
         return self._set(modelAlias=modelAlias)

-    def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
-        """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
-        return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
+    # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
+    #     """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
+    #     return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)

-    def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
-        """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
-        return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
+    # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
+    #     """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
+    #     return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)

     def setEmbedding(self, embedding: bool):
         """Whether to load model with embedding support"""
@@ -1098,9 +1132,9 @@ def setFlashAttention(self, flashAttention: bool):
         """Whether to enable Flash Attention"""
         return self._set(flashAttention=flashAttention)

-    def setInputPrefixBos(self, inputPrefixBos: bool):
-        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
-        return self._set(inputPrefixBos=inputPrefixBos)
+    # def setInputPrefixBos(self, inputPrefixBos: bool):
+    #     """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
+    #     return self._set(inputPrefixBos=inputPrefixBos)

     def setUseMmap(self, useMmap: bool):
         """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
@@ -1260,9 +1294,9 @@ def setTokenBias(self, tokenBias: Dict[str, float]):
         """Set token id bias"""
         return self._call_java("setTokenBias", tokenBias)

-    def setLoraAdapters(self, loraAdapters: Dict[str, float]):
-        """Set LoRA adapters with their scaling factors"""
-        return self._call_java("setLoraAdapters", loraAdapters)
+    # def setLoraAdapters(self, loraAdapters: Dict[str, float]):
+    #     """Set LoRA adapters with their scaling factors"""
+    #     return self._call_java("setLoraAdapters", loraAdapters)

     def getMetadata(self):
         """Gets the metadata of the model"""

python/test/annotator/embeddings/auto_gguf_embeddings_test.py

Lines changed: 2 additions & 2 deletions
@@ -153,8 +153,8 @@ def runTest(self):
             .setInputCols("document")
             .setOutputCol("embeddings")
             .setBatchSize(4)
-            .setNUbatch(2048)
-            .setNBatch(2048)
+            .setNUbatch(4096)
+            .setNBatch(4096)
         )
         pipeline = Pipeline().setStages([self.document_assembler, model])
         results = pipeline.fit(self.long_data).transform(self.long_data)
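The test now raises both the physical (nUbatch) and logical (nBatch) batch sizes to 4096, presumably so the long test documents fit in a single llama.cpp batch. The equivalent standalone configuration, with a placeholder pretrained name and a document_assembler assumed to be defined as in the test class:

from pyspark.ml import Pipeline
from sparknlp.annotator import AutoGGUFEmbeddings

embeddings = (
    AutoGGUFEmbeddings.pretrained()   # placeholder; the test loads its own model
    .setInputCols("document")
    .setOutputCol("embeddings")
    .setBatchSize(4)
    .setNUbatch(4096)   # physical batch size
    .setNBatch(4096)    # logical batch size
)
pipeline = Pipeline().setStages([document_assembler, embeddings])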
