Core Building Blocks / AI Agent

Testing

Test agents with harnesses, scripted models, and deterministic tool responses.

Agents have two testing levels:

| Level | API | Validates |
| --- | --- | --- |
| Handler test | `createAgentContextMock(...)` | Agent logic, model interaction, tool selection |
| Runtime test | `createAgentTestHarness(...)` | Full pipeline with real models or scripted responses |

Handler test

Use the context mock to test your agent in isolation:

import { describe, test, expect, vi } from 'vitest'
import { safeBind } from '@purista/core'
import { createAgentContextMock } from '@purista/ai-harness'
import { supportAgent } from './supportAgent.js'
import { aiService } from '../aiService.js'

// Handler-level tests: the agent function is bound to `aiService` and invoked
// directly against a mocked context, with the model call replaced by a vi.fn().
describe('supportAgent', () => {
  test('returns structured output', async () => {
    const query = 'How do I reset my password?'
    const mock = createAgentContextMock(supportAgent, { payload: { query } })

    // Canned structured reply returned by the mocked model
    const cannedReply = {
      answer: 'Go to Settings > Security > Reset Password',
      confidence: 0.95,
      sources: ['knowledge-base-1'],
    }
    mock.context.harness.models.gpt4.object = vi.fn().mockResolvedValue(cannedReply)

    const runAgent = safeBind(supportAgent.getAgentFunction(), aiService)
    const output = await runAgent(mock.context, { query }, {})

    expect(output.answer).toBe('Go to Settings > Security > Reset Password')
    expect(output.confidence).toBe(0.95)
  })

  test('calls tools when needed', async () => {
    const query = 'What is my order status?'
    const { context: agentContext, stubs: serviceStubs } = createAgentContextMock(supportAgent, {
      payload: { query },
    })
    const orderServiceStub = serviceStubs.service.OrderService['1']

    // The stubbed service answers the tool invocation
    orderServiceStub.getOrderHistory.mockResolvedValue({
      orders: [{ id: 'ord-1', status: 'delivered' }],
    })

    // First model reply requests the tool, second reply is the final answer
    const toolRequest = { toolCall: 'getOrderHistory', parameters: { userId: 'user-1' } }
    const finalReply = { answer: 'Your last order was delivered yesterday', confidence: 0.9 }
    agentContext.harness.models.gpt4.object = vi
      .fn()
      .mockResolvedValueOnce(toolRequest)
      .mockResolvedValueOnce(finalReply)

    const runAgent = safeBind(supportAgent.getAgentFunction(), aiService)
    const output = await runAgent(agentContext, { query }, {})

    expect(orderServiceStub.getOrderHistory).toHaveBeenCalledWith({ userId: 'user-1' })
    expect(output.answer).toContain('delivered')
  })
})

Scripted model harness

For deterministic tests, use a scripted model instead of a real AI:

import { createScriptedHarnessModel, createAgentContextMock } from '@purista/ai-harness'
import { safeBind } from '@purista/core'

// Drives `supportAgent` with a scripted model, so the reply is chosen from the
// prompt text by the matchers below instead of coming from a real model.
test('deterministic agent behavior', async () => {
  const scriptedModel = createScriptedHarnessModel({
    responses: [
      {
        // Match on both keywords rather than the contiguous phrase 'reset password':
        // the query used below reads "reset my password", which that phrase never matches.
        match: (prompt) => prompt.includes('reset') && prompt.includes('password'),
        response: { answer: 'Click forgot password', confidence: 0.9 },
      },
      {
        match: (prompt) => prompt.includes('order status'),
        response: { toolCall: 'getOrderHistory', parameters: { userId: 'user-1' } },
      },
    ],
  })

  const { context } = createAgentContextMock(supportAgent, {
    payload: { query: 'How do I reset my password?' },
  })

  // Swap the whole model for the scripted one (not just a single method)
  context.harness.models.gpt4 = scriptedModel

  const handler = safeBind(supportAgent.getAgentFunction(), aiService)
  const result = await handler(context, { query: 'How do I reset my password?' }, {})

  expect(result.answer).toBe('Click forgot password')
})

Testing tool selection

// Records which tool the mocked model requests for a billing question and
// checks that `checkBillingStatus` is among them.
describe('tool selection', () => {
  test('selects correct tool for billing query', async () => {
    const { context } = createAgentContextMock(supportAgent, {
      payload: { query: 'Why was I charged $50?' },
    })

    const toolCalls: string[] = []
    // Async so the mock returns a promise, like the mockResolvedValue mocks in the other tests.
    context.harness.models.gpt4.object = vi.fn().mockImplementation(async (prompt) => {
      // 'charged' is the word the query above actually contains; 'billing' is kept
      // for prompts that mention it elsewhere.
      const isBillingPrompt = prompt.includes('charged') || prompt.includes('billing')

      // Request the tool only once: if every matching prompt produced a tool call,
      // the model would never return a final answer.
      if (toolCalls.length === 0 && isBillingPrompt) {
        toolCalls.push('checkBillingStatus')
        return { toolCall: 'checkBillingStatus', parameters: { userId: 'user-1' } }
      }
      return { answer: 'Here is your billing info', confidence: 0.9 }
    })

    const handler = safeBind(supportAgent.getAgentFunction(), aiService)
    await handler(context, { query: 'Why was I charged $50?' }, {})

    expect(toolCalls).toContain('checkBillingStatus')
  })
})

Testing session state

// Seeds the context session with earlier turns and checks that the prompt sent
// to the model carries that history.
describe('conversation mode', () => {
  test('remembers previous context', async () => {
    // One payload for both the mock and the handler call, so the conversationId
    // given to the mock is also what the handler receives.
    const payload = { query: 'What about premium?', conversationId: 'conv-1' }
    const { context } = createAgentContextMock(supportAgent, { payload })

    // Simulate previous state in session
    context.session = {
      conversationId: 'conv-1',
      history: [
        { role: 'user', content: 'Tell me about plans' },
        { role: 'assistant', content: 'We have Basic and Premium' },
      ],
    }

    const modelCall = vi.fn().mockResolvedValue({
      answer: 'Premium includes all features',
      confidence: 0.95,
    })
    context.harness.models.gpt4.object = modelCall

    const handler = safeBind(supportAgent.getAgentFunction(), aiService)
    const result = await handler(context, payload, {})

    // Inspect the recorded prompts after the run: an expect() thrown inside the
    // mock could be caught by the agent's own error handling and never fail the test.
    expect(modelCall).toHaveBeenCalled()
    for (const [prompt] of modelCall.mock.calls) {
      // The prompt should include conversation history
      expect(prompt).toContain('Basic and Premium')
    }

    expect(result.answer).toBe('Premium includes all features')
  })
})

Testing streaming

// Streaming handler test: the mocked text model yields three chunks and the
// writer from the context mock is passed to the handler as a fourth argument.
describe('streaming mode', () => {
  test('emits chunks during reasoning', async () => {
    const query = 'Explain quantum computing'
    const mock = createAgentContextMock(supportAgent, {
      payload: { query },
      streaming: true,
    })

    // Chunks the mocked model emits, in order
    const chunks = ['Quantum computing ', 'uses qubits ', 'instead of bits.']
    mock.context.harness.models.gpt4.text = vi.fn().mockImplementation(async function* () {
      yield* chunks
    })

    const runAgent = safeBind(supportAgent.getAgentFunction(), aiService)
    await runAgent(mock.context, { query }, {}, mock.writer)

    expect(mock.writer.write).toHaveBeenCalledTimes(3)
    expect(mock.writer.close).toHaveBeenCalled()
  })
})

Runtime test

Use the harness when you need to verify the full agent pipeline:

import { createAgentTestHarness, createScriptedHarnessModel } from '@purista/ai-harness'

// Runtime-level test: runs the agent through the test harness with a scripted
// model that answers every prompt the same way.
test('full runtime: processes agent request', async () => {
  // Matches any prompt and always returns the same structured reply
  const alwaysAnswer = createScriptedHarnessModel({
    responses: [
      {
        match: () => true,
        response: { answer: 'Test response', confidence: 0.9 },
      },
    ],
  })

  const runtime = await createAgentTestHarness(aiService, supportAgent, {
    models: { gpt4: alwaysAnswer },
  })

  try {
    const output = await runtime.run({ payload: { query: 'Test query' } })

    expect(output.answer).toBe('Test response')
    expect(output.confidence).toBe(0.9)
  } finally {
    // Destroy the harness even when an assertion above fails
    await runtime.destroy()
  }
})

The runtime harness:

  • Starts a real service with the agent registered
  • Can use real models or scripted responses
  • Runs the full pipeline including guards and tool routing
  • Requires `await harness.destroy()` to clean up — call it in a `finally` block so cleanup also runs when an assertion fails

Testing workflows

import { createAgentTestHarness, createScriptedHarnessModel } from '@purista/ai-harness'

// Runtime-level workflow test: the scripted model has one canned reply per
// prompt keyword ('Classify', 'billing status', 'Summarize').
test('workflow: routes billing queries correctly', async () => {
  // Kept in the original order in case the scripted model is order-sensitive
  const scriptedSteps = [
    {
      match: (prompt) => prompt.includes('Classify'),
      response: { category: 'billing' },
    },
    {
      match: (prompt) => prompt.includes('billing status'),
      response: { status: 'paid', amount: 50 },
    },
    {
      match: (prompt) => prompt.includes('Summarize'),
      response: { answer: 'You paid $50', escalated: false },
    },
  ]

  const runtime = await createAgentTestHarness(aiService, supportWorkflow, {
    models: {
      gpt4: createScriptedHarnessModel({ responses: scriptedSteps }),
    },
  })

  try {
    const output = await runtime.run({ payload: { query: 'Why was I charged $50?' } })

    expect(output.escalated).toBe(false)
    expect(output.answer).toContain('$50')
  } finally {
    // Destroy the harness even when an assertion above fails
    await runtime.destroy()
  }
})

Which level should you use?

| Scenario | Recommended level |
| --- | --- |
| Model prompt logic | Handler test |
| Tool selection | Handler test |
| Output schema validation | Handler test |
| Session state | Handler test |
| Streaming behavior | Handler test |
| Guard integration | Runtime test |
| Workflow routing | Runtime test |
| Full pipeline with models | Runtime test |