Evaluating LlamaIndex
Quick Summary
DeepEval integrates nicely with LlamaIndex's ResponseEvaluator class. Below is an example of how to use DeepEval's factual consistency (hallucination) check inside a custom LlamaIndex response evaluator.
from llama_index.response.schema import Response
from typing import List
from llama_index.schema import Document
from deepeval.metrics import HallucinationMetric
from llama_index import (
    TreeIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
)
from llama_index.llms import OpenAI
from llama_index.evaluation import ResponseEvaluator
import os
import openai

# Replace with your own OpenAI API key
api_key = "sk-XXX"
openai.api_key = api_key

# Use GPT-4 for both the indices and LlamaIndex's built-in ResponseEvaluator
gpt4 = OpenAI(temperature=0, model="gpt-4", api_key=api_key)
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)
evaluator_gpt4 = ResponseEvaluator(service_context=service_context_gpt4)
Getting a LlamaHub Loader
from llama_index import download_loader

# Download the Wikipedia loader from LlamaHub and load the "Tokyo" page
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(pages=['Tokyo'])

# Build a tree index and a vector index over the same documents
tree_index = TreeIndex.from_documents(documents=documents)
vector_index = VectorStoreIndex.from_documents(
    documents, service_context=service_context_gpt4
)
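Before writing the evaluator, it can be worth sanity-checking that the Wikipedia data actually loaded, for example by querying the vector index directly. This is an optional sketch; the variable names and the question are purely illustrative.
# Optional sanity check: ask the vector index a simple question about the loaded page.
# (Illustrative only; not required for the evaluation below.)
sanity_engine = vector_index.as_query_engine()
sanity_response = sanity_engine.query("What is Tokyo known for?")
print(sanity_response)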
We then build an evaluator that follows the same interface as LlamaIndex's BaseEvaluator class, which requires an evaluate method. In this example, we show how to write a factual consistency check using DeepEval's HallucinationMetric.
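Before wiring the metric into LlamaIndex, here is a minimal standalone sketch of how DeepEval's HallucinationMetric is used on its own; the input, output, and context strings below are made-up illustrative values.
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Standalone illustration of the metric used in the evaluator below.
example_case = LLMTestCase(
    input="What is the capital of Japan?",
    actual_output="Tokyo is the capital of Japan.",
    context=["Tokyo is the capital city of Japan."],
)
example_metric = HallucinationMetric()
example_metric.measure(example_case)
print(example_metric.score, example_metric.is_successful())
The evaluator below wraps this same metric around a LlamaIndex Response object, pulling the context from the response's source nodes.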
from deepeval.test_case import LLMTestCase
from deepeval.metrics import HallucinationMetric

class HallucinationResponseEvaluator:
    def get_context(self, response: Response) -> List[Document]:
        """Get context information from the given Response object using its source nodes.

        Args:
            response (Response): Response object from an index based on the query.
        Returns:
            List of Documents built from the source nodes' content.
        """
        context = []
        for context_info in response.source_nodes:
            context.append(Document(text=context_info.node.get_content()))
        return context

    def evaluate(self, response: Response) -> str:
        # Measure factual consistency of the answer against the retrieved context
        answer = str(response)
        context = self.get_context(response)
        metric = HallucinationMetric()
        test_case = LLMTestCase(
            input="This is an example input",
            # DeepEval expects the context as a list of strings
            context=[doc.get_content() for doc in context],
            actual_output=answer,
        )
        metric.measure(test_case=test_case)
        if metric.is_successful():
            return "YES"
        else:
            return "NO"

evaluator = HallucinationResponseEvaluator()
You can then run an evaluation on a query response as follows:
# Query the tree index and check the response for factual consistency
query_engine = tree_index.as_query_engine()
response = query_engine.query("How did Tokyo get its name?")
eval_result = evaluator.evaluate(response)
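The evaluate method returns "YES" when the response passes the hallucination check and "NO" otherwise. As an optional follow-up sketch, you can print the verdict and run the same question against the vector index built earlier; the variable names here are illustrative.
print(eval_result)  # "YES" if the answer is grounded in the retrieved context, otherwise "NO"

# Optional: evaluate the same question answered by the vector index.
vector_engine = vector_index.as_query_engine()
vector_response = vector_engine.query("How did Tokyo get its name?")
print(evaluator.evaluate(vector_response))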