
[Update] Update docs #1534

Status: Open. Wants to merge 21 commits into base `main`.

Commits (21):
- `8bd928e` fix pip version (bittersweet1999, Jun 6, 2024)
- `8d2a566` fix pip version (bittersweet1999, Jun 6, 2024)
- `003d4f7` Merge branch 'open-compass:main' into main (bittersweet1999, Jun 7, 2024)
- `d176d1d` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 1, 2024)
- `2541889` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 10, 2024)
- `91b36ea` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 11, 2024)
- `c73a282` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 12, 2024)
- `c4aa782` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 18, 2024)
- `59f02ce` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 18, 2024)
- `03a253a` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 19, 2024)
- `d8dd665` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 19, 2024)
- `e4209e0` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 23, 2024)
- `6fd9c69` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 26, 2024)
- `efac296` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 26, 2024)
- `d72ca83` Merge branch 'open-compass:main' into main (bittersweet1999, Jul 29, 2024)
- `d66be56` Merge branch 'open-compass:main' into main (bittersweet1999, Sep 2, 2024)
- `42404a5` Merge branch 'open-compass:main' into main (bittersweet1999, Sep 6, 2024)
- `f3f3b20` Merge branch 'open-compass:main' into main (bittersweet1999, Sep 11, 2024)
- `5c5d5c1` Merge branch 'open-compass:main' into main (bittersweet1999, Sep 14, 2024)
- `ffeb095` Merge branch 'open-compass:main' into main (bittersweet1999, Sep 18, 2024)
- `f91abd9` update readme (bittersweet1999, Sep 18, 2024)
46 changes: 46 additions & 0 deletions README.md
@@ -193,6 +193,52 @@ After ensuring that OpenCompass is installed correctly according to the above steps,
opencompass ./configs/eval_api_demo.py
```

- Subjective evaluation

  When conducting subjective evaluations, in addition to specifying the models and datasets, you also need to specify a strong model as the judge model, such as an API model like GPT-4 or an open-source large model like Qwen2-72B-Instruct.
  Prepare the following Python script to specify the parameters, replacing the models, datasets, and judge models according to your own needs:

```python
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct
from opencompass.configs.models.qwen.lmdeploy_qwen2_72b_instruct import models as lmdeploy_qwen2_72b_instruct

from opencompass.partitioners import NaivePartitioner
# Reviewer (Collaborator) suggested change:
#   - from opencompass.partitioners import NaivePartitioner
#   + from opencompass.partitioners import NumWorkerPartitioner

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import SubjectiveSummarizer

### Base Configuration
models = lmdeploy_qwen2_7b_instruct
datasets = [*alignbench_datasets, *alpacav2_datasets]
judge_models = lmdeploy_qwen2_72b_instruct
work_dir = 'outputs/subjective/'


### Advanced Configuration
infer = dict(
partitioner=dict(type=NaivePartitioner),
    # Reviewer (Collaborator) suggested change:
    #   - partitioner=dict(type=NaivePartitioner),
    #   + partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
```
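The judge does not have to be an open-source model. This PR's `configs/eval_subjective.py` keeps an API judge as a commented-out alternative; the sketch below is adapted from it (the original also passes a `meta_template`, omitted here for brevity; the key falls back to `$OPENAI_API_KEY` when not set inline):

```
# Alternative judge: an API model, adapted from configs/eval_subjective.py in this PR.
from opencompass.models import OpenAI

judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # read from $OPENAI_API_KEY if not written here
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]
```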

After setting up your Python config file, run it:

```bash
# Python scripts
opencompass your_config_name.py
```
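The reviewer-suggested `NumWorkerPartitioner` differs from `NaivePartitioner` in how it splits work: naive partitioning yields one partition per task, while worker-count partitioning caps the number of partitions. This toy sketch is plain Python, not OpenCompass code; the function names are illustrative only:

```python
# Toy illustration of the two partitioning strategies (NOT OpenCompass's implementation).

def naive_partition(tasks):
    """Naive-style split: one partition per task."""
    return [[t] for t in tasks]

def num_worker_partition(tasks, num_worker):
    """Worker-count-style split: distribute tasks round-robin into at most num_worker partitions."""
    buckets = [[] for _ in range(min(num_worker, len(tasks)))]
    for i, task in enumerate(tasks):
        buckets[i % len(buckets)].append(task)
    return buckets

tasks = ["alignbench", "alpacav2", "mtbench", "wildbench"]
print(naive_partition(tasks))           # 4 partitions, one task each
print(num_worker_partition(tasks, 2))   # 2 partitions, two tasks each
```

With `num_worker=1` (the reviewer's suggested setting), everything lands in a single partition, i.e. one sequential inference job.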

- Accelerated Evaluation

Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
46 changes: 46 additions & 0 deletions README_zh-CN.md
@@ -189,6 +189,52 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
opencompass ./configs/eval_api_demo.py
```

- ### Subjective Evaluation

  When conducting subjective evaluations, in addition to specifying the models and datasets, you also need to specify a strong model as the judge model, such as an API model like GPT-4 or an open-source large model like Qwen2-72B-Instruct.
  Prepare the following Python script to specify the parameters, replacing the models, datasets, and judge models according to your own needs:

```python
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct
from opencompass.configs.models.qwen.lmdeploy_qwen2_72b_instruct import models as lmdeploy_qwen2_72b_instruct

from opencompass.partitioners import NaivePartitioner
# Reviewer (Collaborator) suggested change:
#   - from opencompass.partitioners import NaivePartitioner
#   + from opencompass.partitioners import NumWorkerPartitioner

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import SubjectiveSummarizer

### Base Configuration
models = lmdeploy_qwen2_7b_instruct
datasets = [*alignbench_datasets, *alpacav2_datasets]
judge_models = lmdeploy_qwen2_72b_instruct
work_dir = 'outputs/subjective/'


### Advanced Configuration (defaults are usually fine)
infer = dict(
partitioner=dict(type=NaivePartitioner),
    # Reviewer (Collaborator) suggested change:
    #   - partitioner=dict(type=NaivePartitioner),
    #   + partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
```

Then run:

```bash
# Python scripts
opencompass your_config_name.py
```

- ### Inference Backends

  Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/zh_cn/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
28 changes: 16 additions & 12 deletions configs/eval_subjective.py
@@ -10,6 +10,8 @@
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
from opencompass.configs.models.qwen.lmdeploy_qwen2_72b_instruct import models as lmdeploy_qwen2_72b_instruct

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -67,18 +69,20 @@
# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
# judge_models = [dict(
# abbr='GPT4-Turbo',
# type=OpenAI,
# path='gpt-4-1106-preview',
# key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
# meta_template=api_meta_template,
# query_per_second=16,
# max_out_len=2048,
# max_seq_len=2048,
# batch_size=8,
# temperature=0,
# )]

judge_models = lmdeploy_qwen2_72b_instruct

## ------------- Evaluation Configuration
eval = dict(
6 changes: 3 additions & 3 deletions docs/en/advanced_guides/subjective_evaluation.md
@@ -33,9 +33,9 @@ Similar to objective evaluation, import the models and datasets that need to be

```diff
 with read_base():
-    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
-    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
-    from .models.qwen.hf_qwen_7b import models
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
+    from opencompass.configs.models.qwen.hf_qwen_7b import models
```

It is worth noting that since the model settings for subjective evaluation often differ from those for objective evaluation, inference usually needs to use `do_sample` instead of `greedy` decoding. You can modify the relevant parameters in the configuration file as needed, for example:
6 changes: 3 additions & 3 deletions docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -33,9 +33,9 @@

```diff
 with read_base():
-    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
-    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
-    from .models.qwen.hf_qwen_7b import models
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
+    from opencompass.configs.models.qwen.hf_qwen_7b import models
```

It is worth noting that since the model settings for subjective evaluation usually differ from those for objective evaluation, inference often needs to use `do_sample` instead of `greedy` decoding; you can modify the relevant parameters in the configuration file as needed, for example: