@@ -765,14 +765,14 @@ class HasLlamaCppProperties:
     # -------- MODEL PARAMETERS --------
     nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation",
                      typeConverter=TypeConverters.toInt)
-    nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
-                          typeConverter=TypeConverters.toInt)
+    # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
+    #                       typeConverter=TypeConverters.toInt)
     nThreadsBatch = Param(Params._dummy(), "nThreadsBatch",
                           "Set the number of threads to use during batch and prompt processing",
                           typeConverter=TypeConverters.toInt)
-    nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
-                               "Set the number of threads to use during batch and prompt processing",
-                               typeConverter=TypeConverters.toInt)
+    # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
+    #                            "Set the number of threads to use during batch and prompt processing",
+    #                            typeConverter=TypeConverters.toInt)
     nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt)
     nBatch = Param(Params._dummy(), "nBatch",
                    "Set the logical batch size for prompt processing (must be >=32 to use BLAS)",
@@ -782,12 +782,12 @@ class HasLlamaCppProperties:
                     typeConverter=TypeConverters.toInt)
     nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding",
                    typeConverter=TypeConverters.toInt)
-    nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
-                    typeConverter=TypeConverters.toInt)
-    nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
-                       typeConverter=TypeConverters.toInt)
-    pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
-                   typeConverter=TypeConverters.toFloat)
+    # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
+    #                 typeConverter=TypeConverters.toInt)
+    # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
+    #                    typeConverter=TypeConverters.toInt)
+    # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
+    #                typeConverter=TypeConverters.toFloat)
     nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)",
                        typeConverter=TypeConverters.toInt)
     nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft",
@@ -802,10 +802,10 @@ class HasLlamaCppProperties:
                       typeConverter=TypeConverters.toString)
     mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.",
                     typeConverter=TypeConverters.toInt)
-    tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
-                        typeConverter=TypeConverters.toListFloat)
-    grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
-    grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
+    # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
+    #                     typeConverter=TypeConverters.toListFloat)
+    # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
+    # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
     ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling",
                          typeConverter=TypeConverters.toFloat)
     ropeFreqScale = Param(Params._dummy(), "ropeFreqScale",
@@ -837,7 +837,7 @@ class HasLlamaCppProperties:
                            typeConverter=TypeConverters.toString)
     # Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
     #
-    # - UNSPECIFIED: Don't use any scaling
+    # - NONE: Don't use any scaling
     # - LINEAR: Linear scaling
     # - YARN: YaRN RoPE scaling
     ropeScalingType = Param(Params._dummy(), "ropeScalingType",
@@ -848,26 +848,28 @@ class HasLlamaCppProperties:
     # - 0 NONE: Don't use any pooling
     # - 1 MEAN: Mean Pooling
     # - 2 CLS: CLS Pooling
+    # - 3 LAST: Last token pooling
+    # - 4 RANK: For reranked models
     poolingType = Param(Params._dummy(), "poolingType",
                         "Set the pooling type for embeddings, use model default if unspecified",
                         typeConverter=TypeConverters.toString)
     modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding",
                        typeConverter=TypeConverters.toString)
     modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString)
-    lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
-                                      "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
-                                      typeConverter=TypeConverters.toString)
-    lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
-                                       "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
-                                       typeConverter=TypeConverters.toString)
+    # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
+    #                                   "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
+    #                                   typeConverter=TypeConverters.toString)
+    # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
+    #                                    "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+    #                                    typeConverter=TypeConverters.toString)
     # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters")
     embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support",
                       typeConverter=TypeConverters.toBoolean)
     flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention",
                            typeConverter=TypeConverters.toBoolean)
-    inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
-                           "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
-                           typeConverter=TypeConverters.toBoolean)
+    # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
+    #                        "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
+    #                        typeConverter=TypeConverters.toBoolean)
     useMmap = Param(Params._dummy(), "useMmap",
                     "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)",
                     typeConverter=TypeConverters.toBoolean)
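
For readers unfamiliar with the declarations above: each option is a Spark ML `Param` declared on the mixin with a `TypeConverter`, then exposed through a setter that delegates to `_set`. Below is a minimal, self-contained sketch of that pattern (assumes only `pyspark`; `HasDemoProperties` and its instantiation are illustrative, not part of this PR):

```python
from pyspark.ml.param import Param, Params, TypeConverters

class HasDemoProperties(Params):
    # Declared with a dummy parent; Spark ML re-parents the Param to the
    # owning instance when the class is constructed.
    nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context",
                 typeConverter=TypeConverters.toInt)

    def setNCtx(self, nCtx: int):
        """Set the size of the prompt context"""
        # _set runs the typeConverter, so e.g. "4096" is coerced to int
        return self._set(nCtx=nCtx)

props = HasDemoProperties()
props.setNCtx(4096)
print(props.getOrDefault("nCtx"))  # 4096
```
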
@@ -948,17 +950,17 @@ def setNThreads(self, nThreads: int):
         """Set the number of threads to use during generation"""
         return self._set(nThreads=nThreads)
 
-    def setNThreadsDraft(self, nThreadsDraft: int):
-        """Set the number of threads to use during draft generation"""
-        return self._set(nThreadsDraft=nThreadsDraft)
+    # def setNThreadsDraft(self, nThreadsDraft: int):
+    #     """Set the number of threads to use during draft generation"""
+    #     return self._set(nThreadsDraft=nThreadsDraft)
 
     def setNThreadsBatch(self, nThreadsBatch: int):
         """Set the number of threads to use during batch and prompt processing"""
         return self._set(nThreadsBatch=nThreadsBatch)
 
-    def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
-        """Set the number of threads to use during batch and prompt processing"""
-        return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
+    # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
+    #     """Set the number of threads to use during batch and prompt processing"""
+    #     return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
 
     def setNCtx(self, nCtx: int):
         """Set the size of the prompt context"""
@@ -976,17 +978,17 @@ def setNDraft(self, nDraft: int):
         """Set the number of tokens to draft for speculative decoding"""
         return self._set(nDraft=nDraft)
 
-    def setNChunks(self, nChunks: int):
-        """Set the maximal number of chunks to process"""
-        return self._set(nChunks=nChunks)
+    # def setNChunks(self, nChunks: int):
+    #     """Set the maximal number of chunks to process"""
+    #     return self._set(nChunks=nChunks)
 
-    def setNSequences(self, nSequences: int):
-        """Set the number of sequences to decode"""
-        return self._set(nSequences=nSequences)
+    # def setNSequences(self, nSequences: int):
+    #     """Set the number of sequences to decode"""
+    #     return self._set(nSequences=nSequences)
 
-    def setPSplit(self, pSplit: float):
-        """Set the speculative decoding split probability"""
-        return self._set(pSplit=pSplit)
+    # def setPSplit(self, pSplit: float):
+    #     """Set the speculative decoding split probability"""
+    #     return self._set(pSplit=pSplit)
 
     def setNGpuLayers(self, nGpuLayers: int):
         """Set the number of layers to store in VRAM (-1 - use default)"""
@@ -1004,17 +1006,17 @@ def setMainGpu(self, mainGpu: int):
         """Set the main GPU that is used for scratch and small tensors."""
         return self._set(mainGpu=mainGpu)
 
-    def setTensorSplit(self, tensorSplit: List[float]):
-        """Set how split tensors should be distributed across GPUs"""
-        return self._set(tensorSplit=tensorSplit)
+    # def setTensorSplit(self, tensorSplit: List[float]):
+    #     """Set how split tensors should be distributed across GPUs"""
+    #     return self._set(tensorSplit=tensorSplit)
 
-    def setGrpAttnN(self, grpAttnN: int):
-        """Set the group-attention factor"""
-        return self._set(grpAttnN=grpAttnN)
+    # def setGrpAttnN(self, grpAttnN: int):
+    #     """Set the group-attention factor"""
+    #     return self._set(grpAttnN=grpAttnN)
 
-    def setGrpAttnW(self, grpAttnW: int):
-        """Set the group-attention width"""
-        return self._set(grpAttnW=grpAttnW)
+    # def setGrpAttnW(self, grpAttnW: int):
+    #     """Set the group-attention width"""
+    #     return self._set(grpAttnW=grpAttnW)
 
     def setRopeFreqBase(self, ropeFreqBase: float):
         """Set the RoPE base frequency, used by NTK-aware scaling"""
@@ -1049,7 +1051,16 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float):
         return self._set(defragmentationThreshold=defragmentationThreshold)
 
     def setNumaStrategy(self, numaStrategy: str):
-        """Set optimization strategies that help on some NUMA systems (if available)"""
+        """Set optimization strategies that help on some NUMA systems (if available)
+
+        Possible values:
+
+        - DISABLED: No NUMA optimizations
+        - DISTRIBUTE: Spread execution evenly over all nodes
+        - ISOLATE: Only spawn threads on CPUs on the node that execution started on
+        - NUMA_CTL: Use the CPU map provided by numactl
+        - MIRROR: Mirror the model across NUMA nodes
+        """
         numaUpper = numaStrategy.upper()
         numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"]
         if numaUpper not in numaStrategies:
@@ -1060,13 +1071,36 @@ def setNumaStrategy(self, numaStrategy: str):
         return self._set(numaStrategy=numaStrategy)
 
     def setRopeScalingType(self, ropeScalingType: str):
-        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
-        return self._set(ropeScalingType=ropeScalingType)
+        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
+
+        Possible values:
+
+        - NONE: Don't use any scaling
+        - LINEAR: Linear scaling
+        - YARN: YaRN RoPE scaling
+        """
+        ropeScalingTypeUpper = ropeScalingType.upper()
+        ropeScalingTypes = ["NONE", "LINEAR", "YARN"]
+        if ropeScalingTypeUpper not in ropeScalingTypes:
+            raise ValueError(
+                f"Invalid RoPE scaling type: {ropeScalingType}. "
+                + f"Valid values are: {ropeScalingTypes}"
+            )
+        return self._set(ropeScalingType=ropeScalingTypeUpper)
 
     def setPoolingType(self, poolingType: str):
-        """Set the pooling type for embeddings, use model default if unspecified"""
+        """Set the pooling type for embeddings, use model default if unspecified
+
+        Possible values:
+
+        - 0 NONE: Don't use any pooling
+        - 1 MEAN: Mean Pooling
+        - 2 CLS: CLS Pooling
+        - 3 LAST: Last token pooling
+        - 4 RANK: For reranked models
+        """
         poolingTypeUpper = poolingType.upper()
-        poolingTypes = ["NONE", "MEAN", "CLS", "LAST"]
+        poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"]
         if poolingTypeUpper not in poolingTypes:
             raise ValueError(
                 f"Invalid pooling type: {poolingType}. "
@@ -1082,13 +1116,13 @@ def setModelAlias(self, modelAlias: str):
         """Set a model alias"""
         return self._set(modelAlias=modelAlias)
 
-    def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
-        """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
-        return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
+    # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
+    #     """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
+    #     return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
 
-    def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
-        """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
-        return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
+    # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
+    #     """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
+    #     return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
 
     def setEmbedding(self, embedding: bool):
         """Whether to load model with embedding support"""
@@ -1098,9 +1132,9 @@ def setFlashAttention(self, flashAttention: bool):
         """Whether to enable Flash Attention"""
         return self._set(flashAttention=flashAttention)
 
-    def setInputPrefixBos(self, inputPrefixBos: bool):
-        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
-        return self._set(inputPrefixBos=inputPrefixBos)
+    # def setInputPrefixBos(self, inputPrefixBos: bool):
+    #     """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
+    #     return self._set(inputPrefixBos=inputPrefixBos)
 
     def setUseMmap(self, useMmap: bool):
         """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
@@ -1260,9 +1294,9 @@ def setTokenBias(self, tokenBias: Dict[str, float]):
         """Set token id bias"""
         return self._call_java("setTokenBias", tokenBias)
 
-    def setLoraAdapters(self, loraAdapters: Dict[str, float]):
-        """Set LoRA adapters with their scaling factors"""
-        return self._call_java("setLoraAdapters", loraAdapters)
+    # def setLoraAdapters(self, loraAdapters: Dict[str, float]):
+    #     """Set LoRA adapters with their scaling factors"""
+    #     return self._call_java("setLoraAdapters", loraAdapters)
 
     def getMetadata(self):
         """Gets the metadata of the model"""
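
A hedged usage sketch of the tightened setters (not part of the diff; assumes Spark NLP's `AutoGGUFModel`, one of the annotators that mixes in `HasLlamaCppProperties`, plus an active Spark session and network access for the pretrained download):

```python
from sparknlp.annotator import AutoGGUFModel

model = AutoGGUFModel.pretrained()

# Values are validated case-insensitively and stored uppercased.
model.setRopeScalingType("yarn")  # stored as "YARN"
model.setPoolingType("rank")      # "RANK" is newly accepted

# Invalid values now fail fast on the Python side instead of at the JVM:
try:
    model.setRopeScalingType("cubic")
except ValueError as err:
    print(err)  # Invalid RoPE scaling type: cubic. Valid values are: ['NONE', 'LINEAR', 'YARN']
```
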