From bbde919a64dbcf1d8a8ebb7dddc3f52e469b2e53 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Sun, 13 Aug 2023 11:06:41 +0800 Subject: [PATCH] update example --- ...wen_doc_search_QA_based_on_dashscope.ipynb | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_dashscope.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_dashscope.ipynb index 475fc48e..f20437ac 100644 --- a/examples/pytorch/application/qwen_doc_search_QA_based_on_dashscope.ipynb +++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_dashscope.ipynb @@ -55,23 +55,24 @@ "from dashscope import TextEmbedding\n", "from dashvector import Client, Doc\n", "\n", - "# get env variable from .env, please make sure add DASHSCOPE_KEY in .env\n", + "# get env variable from .env\n", + "# please make sure DASHSCOPE_KEY is defined in .env\n", "load_dotenv()\n", - "api_key = os.getenv('DASHSCOPE_KEY')\n", - "dashscope.api_key = api_key\n", + "dashscope.api_key = os.getenv('DASHSCOPE_KEY')\n", "\n", - "# initialize dashvector for embedding's indexing and searching\n", - "ds_client = Client(api_key=api_key)\n", + "\n", + "# initialize DashVector for embedding's indexing and searching\n", + "dashvector_client = Client(api_key='{your-dashvector-api-key}')\n", "\n", "# define collection name\n", "collection_name = 'news_embeddings'\n", "\n", "# delete if already exist\n", - "ds_client.delete(collection_name)\n", + "dashvector_client.delete(collection_name)\n", "\n", - "# create a collection with embedding size 1536\n", - "rsp = ds_client.create(collection_name, 1536)\n", - "collection = ds_client.get(collection_name)\n" + "# create a collection with embedding size of 1536\n", + "rsp = dashvector_client.create(collection_name, 1536)\n", + "collection = dashvector_client.get(collection_name)\n" ] }, { @@ -94,7 +95,7 @@ "outputs": [], "source": [ "def prepare_data_from_dir(path, size):\n", - " # prepare the data from a file folder in order to upsert to dashvector with a reasonable doc's size.\n", + " # prepare the data from a file folder in order to upsert to DashVector with a reasonable doc's size.\n", " batch_docs = []\n", " for file in os.listdir(path):\n", " with open(path + '/' + file, 'r', encoding='utf-8') as f:\n", @@ -127,7 +128,7 @@ "outputs": [], "source": [ "def prepare_data_from_file(path, size):\n", - " # prepare the data from file in order to upsert to dashvector with a reasonable doc's size.\n", + " # prepare the data from file in order to upsert to DashVector with a reasonable doc's size.\n", " batch_docs = []\n", " chunk_size = 12\n", " with open(path, 'r', encoding='utf-8') as f:\n", @@ -191,8 +192,8 @@ "id = 0\n", "dir_name = 'CEC-Corpus/raw corpus/allSourceText'\n", "\n", - "# indexing the raw docs with index to dashvector\n", - "collection = ds_client.get(collection_name)\n", + "# indexing the raw docs with index to DashVector\n", + "collection = dashvector_client.get(collection_name)\n", "\n", "# embedding api max batch size\n", "batch_size = 4 \n", @@ -225,7 +226,7 @@ "outputs": [], "source": [ "# check the collection status\n", - "collection = ds_client.get(collection_name)\n", + "collection = dashvector_client.get(collection_name)\n", "rsp = collection.stats()\n", "print(rsp)" ] @@ -249,11 +250,11 @@ }, "outputs": [], "source": [ - "def search_relevant_context(question, topk=1, client=ds_client):\n", + "def search_relevant_context(question, topk=1, client=dashvector_client):\n", " # query and recall the relevant information\n", " collection = client.get(collection_name)\n", "\n", - " # recall the top k similiar results from dashvector\n", + " # recall the top k similarity results from DashVector\n", " rsp = collection.query(generate_embeddings(question), output_fields=['raw'],\n", " topk=topk)\n", " return \"\".join([item.fields['raw'] for item in rsp.output])" @@ -333,7 +334,7 @@ }, "outputs": [], "source": [ - "# define a prompt template for the knowledge enhanced LLM generation\n", + "# define a prompt template for the vectorDB-enhanced LLM generation\n", "def answer_question(question, context):\n", " prompt = f'''请基于```内的内容回答问题。\"\n", "\t```\n", @@ -381,7 +382,7 @@ } ], "source": [ - "# test the case without knowledge\n", + "# test the case on plain LLM without vectorDB enhancement\n", "question = '清华博士发生了什么?'\n", "answer = answer_question(question, '')\n", "print(f'question: {question}\\n' f'answer: {answer}')"