Skip to content

Commit 7fea754

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: LLM - Support tuning of new text embedding models by migrating to the new v1.1.3 pipeline.
PiperOrigin-RevId: 631887159
1 parent 3938107 commit 7fea754

File tree

3 files changed

+115
-47
lines changed

3 files changed

+115
-47
lines changed

tests/unit/aiplatform/test_language_models.py

Lines changed: 90 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ def reverse_string_2(s):""",
563563
"parameterType": "STRING",
564564
},
565565
"base_model_version_id": {
566-
"defaultValue": "textembedding-gecko@001",
566+
"defaultValue": "text-embedding-004",
567567
"description": "which base model to tune. This may be any stable\nnumbered version, for example `textembedding-gecko@001`.",
568568
"isOptional": True,
569569
"parameterType": "STRING",
@@ -578,17 +578,15 @@ def reverse_string_2(s):""",
578578
"description": "the GCS path to the corpus data location.",
579579
"parameterType": "STRING",
580580
},
581-
"iterations": {
582-
"defaultValue": 1000,
583-
"description": "the number of steps to perform fine-tuning.",
581+
"encryption_spec_key_name": {
582+
"defaultValue": "",
584583
"isOptional": True,
585-
"parameterType": "NUMBER_INTEGER",
584+
"parameterType": "STRING",
586585
},
587-
"location": {
588-
"defaultValue": "us-central1",
589-
"description": "GCP region to run the pipeline.",
586+
"learning_rate_multiplier": {
587+
"defaultValue": 1.0,
590588
"isOptional": True,
591-
"parameterType": "STRING",
589+
"parameterType": "NUMBER_DOUBLE",
592590
},
593591
"machine_type": {
594592
"defaultValue": "n1-standard-16",
@@ -602,9 +600,10 @@ def reverse_string_2(s):""",
602600
"isOptional": True,
603601
"parameterType": "STRING",
604602
},
605-
"project": {
606-
"description": "user's project id.",
607-
"parameterType": "STRING",
603+
"output_dimensionality": {
604+
"defaultValue": -1,
605+
"isOptional": True,
606+
"parameterType": "NUMBER_INTEGER",
608607
},
609608
"queries_path": {
610609
"description": "the GCS path to the queries location.",
@@ -626,6 +625,12 @@ def reverse_string_2(s):""",
626625
"description": "the GCS path to the train label data location.",
627626
"parameterType": "STRING",
628627
},
628+
"train_steps": {
629+
"defaultValue": 1000,
630+
"description": "the number of steps to perform fine-tuning.",
631+
"isOptional": True,
632+
"parameterType": "NUMBER_INTEGER",
633+
},
629634
"validation_label_path": {
630635
"defaultValue": "",
631636
"description": "The GCS path to the validation label data location.",
@@ -2283,6 +2288,61 @@ def test_text_generation_response_repr(self):
22832288
["https://us-central1-kfp.pkg.dev/proj/repo/pack/latest"],
22842289
indirect=True,
22852290
)
2291+
@pytest.mark.parametrize(
2292+
"base_model_version_id,tune_args,expected_pipeline_args",
2293+
[ # Do not pass any optional parameters.
2294+
(
2295+
"textembedding-gecko@003",
2296+
dict(
2297+
training_data="gs://bucket/training.tsv",
2298+
corpus_data="gs://bucket/corpus.jsonl",
2299+
queries_data="gs://bucket/queries.jsonl",
2300+
),
2301+
dict(
2302+
base_model_version_id="textembedding-gecko@003",
2303+
train_label_path="gs://bucket/training.tsv",
2304+
corpus_path="gs://bucket/corpus.jsonl",
2305+
queries_path="gs://bucket/queries.jsonl",
2306+
encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
2307+
),
2308+
),
2309+
# Pass all optional parameters.
2310+
(
2311+
"text-multilingual-embedding-002",
2312+
dict(
2313+
training_data="gs://bucket/training.tsv",
2314+
corpus_data="gs://bucket/corpus.jsonl",
2315+
queries_data="gs://bucket/queries.jsonl",
2316+
test_data="gs://bucket/test.tsv",
2317+
validation_data="gs://bucket/validation.tsv",
2318+
tuned_model_location="us-central1",
2319+
model_display_name="my-tuned-model",
2320+
train_steps=30,
2321+
batch_size=256,
2322+
accelerator="NVIDIA_TESLA_V100",
2323+
accelerator_count=1,
2324+
machine_type="n1-highmem-16",
2325+
task_type="DEFAULT",
2326+
),
2327+
dict(
2328+
train_steps=30,
2329+
accelerator_type="NVIDIA_TESLA_V100",
2330+
accelerator_count=1,
2331+
machine_type="n1-highmem-16",
2332+
base_model_version_id="text-multilingual-embedding-002",
2333+
train_label_path="gs://bucket/training.tsv",
2334+
corpus_path="gs://bucket/corpus.jsonl",
2335+
queries_path="gs://bucket/queries.jsonl",
2336+
test_label_path="gs://bucket/test.tsv",
2337+
batch_size=256,
2338+
model_display_name="my-tuned-model",
2339+
validation_label_path="gs://bucket/validation.tsv",
2340+
encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
2341+
task_type="DEFAULT",
2342+
),
2343+
),
2344+
],
2345+
)
22862346
def test_tune_text_embedding_model(
22872347
self,
22882348
mock_pipeline_service_create,
@@ -2294,6 +2354,9 @@ def test_tune_text_embedding_model(
22942354
mock_gcs_upload,
22952355
mock_request_urlopen_gecko,
22962356
mock_deploy_tuned_embedding_model,
2357+
tune_args,
2358+
expected_pipeline_args,
2359+
base_model_version_id,
22972360
):
22982361
"""Tests tuning the text embedding model."""
22992362
aiplatform.init(
@@ -2309,23 +2372,23 @@ def test_tune_text_embedding_model(
23092372
),
23102373
):
23112374
model = language_models.TextEmbeddingModel.from_pretrained(
2312-
"textembedding-gecko@003"
2313-
)
2314-
tuning_job = model.tune_model(
2315-
training_data="gs://bucket/training.tsv",
2316-
corpus_data="gs://bucket/corpus.jsonl",
2317-
queries_data="gs://bucket/queries.jsonl",
2318-
test_data="gs://bucket/test.tsv",
2319-
tuned_model_location="us-central1",
2320-
train_steps=10,
2321-
accelerator="NVIDIA_TESLA_A100",
2375+
base_model_version_id
23222376
)
2377+
tuning_job = model.tune_model(**tune_args)
23232378
call_kwargs = mock_pipeline_service_create.call_args[1]
2324-
pipeline_arguments = call_kwargs[
2325-
"pipeline_job"
2326-
].runtime_config.parameter_values
2327-
assert pipeline_arguments["iterations"] == 10
2328-
assert pipeline_arguments["accelerator_type"] == "NVIDIA_TESLA_A100"
2379+
pipeline_arguments = dict(
2380+
call_kwargs["pipeline_job"].runtime_config.parameter_values
2381+
)
2382+
2383+
if (
2384+
"model_display_name" not in tune_args
2385+
and "model_display_name" in pipeline_arguments
2386+
):
2387+
# This is automatically generated from some params, so don't
2388+
# check it.
2389+
del pipeline_arguments["model_display_name"]
2390+
2391+
assert pipeline_arguments == expected_pipeline_args
23292392

23302393
# Testing the tuned model
23312394
tuned_model = tuning_job.deploy_tuned_model()

vertexai/_model_garden/_model_garden_models.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@
3939
"chat-bison-32k": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
4040
"codechat-bison": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
4141
"codechat-bison-32k": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
42-
"textembedding-gecko": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.2",
43-
"textembedding-gecko-multilingual": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.2",
42+
"textembedding-gecko": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
43+
"textembedding-gecko-multilingual": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
44+
"text-embedding-004": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
45+
"text-multilingual-embedding-002": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
4446
}
4547

4648
_LOGGER = base.Logger(__name__)

vertexai/language_models/_language_models.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -414,20 +414,24 @@ def _tune_model(
414414
model_id=self._model_id,
415415
schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)},
416416
)
417-
if model_info.tuning_pipeline_uri.startswith(
418-
"https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
419-
):
420-
train_steps = tuning_parameters.pop("train_steps", None)
421-
if train_steps:
422-
tuning_parameters["iterations"] = train_steps
417+
if _is_text_embedding_tuning_pipeline(model_info.tuning_pipeline_uri):
423418
tunable_base_model_id = self._model_id.rpartition("/")[-1]
424419
tuning_parameters["base_model_version_id"] = tunable_base_model_id
425420
else:
426421
tuning_parameters["large_model_reference"] = model_info.tuning_model_id
427-
if aiplatform_initializer.global_config.encryption_spec_key_name:
428-
tuning_parameters[
429-
"encryption_spec_key_name"
430-
] = aiplatform_initializer.global_config.encryption_spec_key_name
422+
tuning_parameters.update(
423+
{
424+
"project": aiplatform_initializer.global_config.project,
425+
# TODO(b/275444096): Remove the explicit location once tuning
426+
# can happen in all regions.
427+
# "location": aiplatform_initializer.global_config.location,
428+
"location": tuned_model_location,
429+
}
430+
)
431+
if aiplatform_initializer.global_config.encryption_spec_key_name:
432+
tuning_parameters[
433+
"encryption_spec_key_name"
434+
] = aiplatform_initializer.global_config.encryption_spec_key_name
431435

432436
if not model_info.tuning_pipeline_uri:
433437
raise RuntimeError(f"The {self._model_id} model does not support tuning")
@@ -3890,6 +3894,12 @@ def _maybe_upload_training_data(
38903894
)
38913895

38923896

3897+
def _is_text_embedding_tuning_pipeline(pipeline_uri: str) -> bool:
3898+
return pipeline_uri.startswith(
3899+
"https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
3900+
)
3901+
3902+
38933903
def _launch_tuning_job(
38943904
training_data: Union[str, "pandas.core.frame.DataFrame"],
38953905
model_id: str,
@@ -3931,16 +3941,9 @@ def _launch_tuning_job(
39313941
model_display_name = name[:max_display_name_length]
39323942

39333943
pipeline_arguments = {
3934-
"project": aiplatform_initializer.global_config.project,
3935-
# TODO(b/275444096): Remove the explicit location once tuning can happen in all regions
3936-
# "location": aiplatform_initializer.global_config.location,
3937-
"location": tuned_model_location,
39383944
"model_display_name": model_display_name,
39393945
}
3940-
3941-
if tuning_pipeline_uri.startswith(
3942-
"https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
3943-
):
3946+
if _is_text_embedding_tuning_pipeline(tuning_pipeline_uri):
39443947
pipeline_arguments["train_label_path"] = training_data_path
39453948
elif training_data_path.startswith("gs://"):
39463949
pipeline_arguments["dataset_uri"] = training_data_path

0 commit comments

Comments (0)