LangSmith Tutorial (No LangChain)
Here, we'll do a very basic walkthrough of how you can get started with LangSmith in TypeScript without LangChain.
First, we'll do some setup. Create a LangSmith API key by navigating to the settings page in LangSmith, then create a .env file in the same directory as this notebook with values for the following variables:
OPENAI_API_KEY=<YOUR OPENAI API KEY>
LANGCHAIN_TRACING_V2=true
LANGCHAIN_PROJECT='langsmith-wikirag-walkthrough'
LANGCHAIN_API_KEY=<YOUR LANGSMITH API KEY>
import "dotenv/config"; // Load env vars from .env file
import OpenAI from "npm:openai";
import wiki from "npm:wikipedia";
import { Client } from "langsmith";
import { traceable } from "langsmith/traceable";
import { wrapOpenAI } from "langsmith/wrappers";
// Wrap the OpenAI client in the LangSmith wrapper so completions are logged to LangSmith
const openai = wrapOpenAI(new OpenAI());
const langsmith = new Client();
Create a Wikipedia RAG pipeline that doesn't use LangChain
Here, we'll create a very simple RAG pipeline that:
- Generates a Wikipedia search query from the input question
- Retrieves relevant page summaries from Wikipedia based on the search query
- Answers the input question based on the context from the retrieval step
async function generateWikiSearch(input: { question: string }) {
const messages = [
{
role: "system" as const,
content:
"Generate a search query to pass into wikipedia to answer the user's question. Return only the search query and nothing more. This will be passed in directly to the Wikipedia search engine.",
},
{ role: "user" as const, content: input.question },
];
const chatCompletion = await openai.chat.completions.create({
model: "gpt-3.5-turbo",
messages: messages,
temperature: 0,
});
return chatCompletion.choices[0].message.content ?? "";
}
function convertDocs(results: Array<{ summary: string; url: string }>) {
// Convert docs to a format that LangSmith accepts (for nicer rendering)
return results.map((r) => ({
page_content: r.summary,
type: "Document",
metadata: { url: r.url },
}));
}
async function retrieve(input: { query: string; numDocuments: number }) {
const { results } = await wiki.search(input.query, { limit: 10 });
const finalResults: Array<{ summary: string; url: string }> = [];
for (const result of results) {
if (finalResults.length >= input.numDocuments) {
// Only keep the top numDocuments pages
break;
}
const page = await wiki.page(result.title, { autoSuggest: false });
const summary = await page.summary();
finalResults.push({
summary: summary.extract,
url: page.fullurl,
});
}
return convertDocs(finalResults);
}
async function generateAnswer(input: { question: string; context: string }) {
const messages = [
{
role: "system" as const,
content: `Answer the user's question based only on the content below:\n\n${input.context}`,
},
{ role: "user" as const, content: input.question },
];
const chatCompletion = await openai.chat.completions.create({
model: "gpt-3.5-turbo",
messages: messages,
temperature: 0,
});
return chatCompletion.choices[0].message.content ?? "";
}
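// Wrap each step in traceable so LangSmith records it as a run; calls made
// inside another traceable function are logged as child runs, so the whole
// pipeline shows up as a single trace tree.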
const traceGenerateWikiSearch = traceable(generateWikiSearch);
const traceRetrieve = traceable(retrieve, {
name: "Retrieve Wiki",
run_type: "retriever",
});
const traceGenerateAnswer = traceable(generateAnswer);
const traceRagPipeline = traceable(
async ({ question }: { question: string }, numDocuments: number = 2) => {
const query = await traceGenerateWikiSearch({ question });
const retrieverResults = await traceRetrieve({ query, numDocuments });
const context = retrieverResults
.map((result) => result.page_content)
.join("\n\n");
const answer = await traceGenerateAnswer({ question, context });
return answer;
},
{ name: "Wiki RAG Pipeline", run_type: "chain" }
);
await traceRagPipeline({
question: "When was the Apple Vision Pro released in the US?",
});
[32m"The Apple Vision Pro was released in the United States on February 2, 2024."[39m
Run the pipeline on some test cases
Before deploying to an initial set of users, it's often helpful to create a test set of a few examples, then run your pipeline on the test set.
LangSmith makes it easy to run custom evaluations (both LLM and heuristic-based) to score the results.
// Create a dataset for testing using the LangSmith client
const examples = [
[
"When was the Apple Vision Pro released in the US?",
"The Apple Vision Pro was released in the United States on February 2, 2024.",
],
[
"What is LangChain?",
"LangChain is an open-source framework for building applications using large language models.",
],
[
"Who is the chairman of OpenAI?",
"Bret Taylor is the chairman of OpenAI"
],
];
const datasetName = "Wikipedia RAG Pipeline";
const dataset = await langsmith.createDataset(datasetName);
await Promise.all(
examples.map(async ([question, answer]) => {
await langsmith.createExample(
{ question },
{ answer },
{ datasetId: dataset.id }
);
})
);
[ undefined, undefined, undefined ]
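As a quick sanity check, you can read the examples back from LangSmith. This is a minimal sketch using the client's listExamples method, which returns an async iterable of examples:
// Optional: verify the examples were uploaded to the dataset
for await (const example of langsmith.listExamples({ datasetId: dataset.id })) {
  console.log(example.inputs, example.outputs);
}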
// Run a set of tests on the dataset and compare them in LangSmith
// First, set up evaluators to run against the test results
import type { RunEvalType, RunEvaluatorLike } from "langchain/smith";
import { runOnDataset, Criteria, LabeledCriteria } from "langchain/smith";
// An illustrative custom evaluator example
const containsOpenAI: RunEvaluatorLike = async ({
run,
example,
input,
prediction,
reference,
}) => {
return {
key: "contains_openai",
score: prediction.output.includes("OpenAI"),
};
};
const evaluators: RunEvalType[] = [
// LangChain's built-in evaluators
Criteria("conciseness"),
LabeledCriteria("correctness"),
// Custom evaluators can be user-defined RunEvaluator instances
// or a compatible function
containsOpenAI,
];
// Use `runOnDataset` to run the pipeline against examples in the Dataset
await runOnDataset(traceRagPipeline, datasetName, {
evaluators,
projectName: "test-2-documents-demo",
});
Predicting: ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 100.00% | 3/3
Running Evaluators: ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 100.00% | 3/3
{
  projectName: "test-2-documents-demo",
  results: {
    "19c3732e-4347-4c97-bf05-57026783245e": {
      execution_time: 1758,
      run_id: "7e11421d-cde6-4e77-8e28-27b5c260ad66",
      feedback: [
        { key: "contains_openai", score: true },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 0, value: "N" }
      ]
    },
    "b846e5ef-2265-42cd-9c50-4d0c7ec61376": {
      execution_time: 1478,
      run_id: "efada762-00c9-4a4e-b003-7d86c0ccdcf7",
      feedback: [
        { key: "contains_openai", score: false },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 1, value: "Y" }
      ]
    },
    "aae4ce78-52fe-49f3-b945-f6f4a7c21dba": {
      execution_time: 1844,
      run_id: "b7447ce7-75c2-4623-8d90-f6fe2db6b97b",
      feedback: [
        { key: "contains_openai", score: false },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 1, value: "Y" }
      ]
    }
  }
}
// Let's now execute a test of the pipeline where we retrieve 4 pages instead of the default 2
// Wrapper function with numDocuments set to 4
const traceRagPipelineFourDocuments = async ({ question }: { question: string }) => {
return traceRagPipeline({ question }, 4);
};
await runOnDataset(traceRagPipelineFourDocuments, datasetName, {
evaluators,
projectName: "test-4-documents-demo",
});
Predicting: ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 100.00% | 3/3
Running Evaluators: ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 100.00% | 3/3
{
  projectName: "test-4-documents-demo",
  results: {
    "19c3732e-4347-4c97-bf05-57026783245e": {
      execution_time: 1927,
      run_id: "7c23bee6-7333-47a0-ab45-bc46f8863eaa",
      feedback: [
        { key: "contains_openai", score: true },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 0, value: "N" }
      ]
    },
    "b846e5ef-2265-42cd-9c50-4d0c7ec61376": {
      execution_time: 2265,
      run_id: "d26226b2-7571-4506-8ee3-f527e3e9b9c7",
      feedback: [
        { key: "contains_openai", score: false },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 1, value: "Y" }
      ]
    },
    "aae4ce78-52fe-49f3-b945-f6f4a7c21dba": {
      execution_time: 2963,
      run_id: "c7c478be-7770-4990-bccf-d3ba2e49e579",
      feedback: [
        { key: "contains_openai", score: false },
        { key: "conciseness", score: 1, value: "Y" },
        { key: "correctness", score: 1, value: "Y" }
      ]
    }
  }
}
Tracing
Let's say you've deployed your application to production or to an initial set of users.
You can view traces of your application in LangSmith and drill down by various attributes to get statistics on a specific set of traces.
You can also attach feedback to your runs (such as through a thumbs up/down button) using the LangSmith client, and then filter to traces with a certain feedback score in the UI, as sketched below.
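Here's a minimal sketch of what attaching a thumbs-up might look like, using the client's listRuns and createFeedback methods. The "user_score" feedback key is an arbitrary choice for this example; in a real app you would capture the run id at request time and call createFeedback from your feedback handler:
// Attach a simulated thumbs-up to a recent pipeline run in our tracing project
for await (const run of langsmith.listRuns({
  projectName: "langsmith-wikirag-walkthrough",
  runType: "chain",
})) {
  await langsmith.createFeedback(run.id, "user_score", { score: 1 });
  break; // annotate just one run for this demo
}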
// Here we execute the pipeline on a number of potential user inputs, simulating how users might interact with it.
import { pLimit } from "https://deno.land/x/p_limit@v1.0.0/mod.ts";
const limit = pLimit(5); // Add a concurrency limit to avoid getting rate limited by OpenAI
async function runRagPipelines(questions: string[]): Promise<string[]> {
const promises = questions.map((question) =>
limit(() => traceRagPipeline({ question }))
);
try {
const results = await Promise.all(promises);
return results;
} catch (error) {
console.error("Error running RAG Pipelines:", error);
throw error;
}
}
// Example usage
const questions: string[] = [
"When was Sam Altman removed from OpenAI?",
"What is the significance of the James Webb Space Telescope's first images?",
"How did the international community respond to the crisis in Ukraine?",
"What is the status of the COVID-19 vaccine distribution worldwide?",
"What are the outcomes of the recent G7 summit?",
"Who are the leading figures in climate change activism today?",
"How many majors has Jannik Sinner won?",
];
await runRagPipelines(questions);