Files
ollama/examples/langchain-typescript-selfqueryingretrieval/generateSource.ts
Matt Williams 9dd88dc040 Fixes for Bruces comments
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-06 14:16:24 -08:00

129 lines
4.4 KiB
TypeScript

import { Artwork, RawArtwork } from './types';
import { HuggingFaceTransformersEmbeddings } from 'langchain/embeddings/hf_transformers';
import { Chroma } from "langchain/vectorstores/chroma";
import { Document } from "langchain/document";
import { ChromaClient } from "chromadb";
// Value sent as the `limit` query parameter of each artist search
const numberOfArtworks = 10;

// Artists whose works are looked up through the API
const artists = [
  "van Gogh",
  "Renoir",
  "Monet",
  "Picasso",
];
/**
 * Rebuilds the "artcollection" Chroma collection: removes whatever an earlier
 * run left behind, fetches the artwork documents for the configured artists,
 * embeds them with the module-level `embedding` model and stores them.
 *
 * Logs the number of stored documents when finished. Errors from fetching the
 * artwork or from creating the vector store propagate to the caller.
 */
const generateSource = async () => {
  const collectionName = "artcollection";

  // Delete the existing vector store so that we don't get duplicate documents.
  // The delete is best-effort: it can reject when there is nothing to delete
  // (for example on a first run), and that must not abort the whole job.
  try {
    await new ChromaClient().deleteCollection({ name: collectionName });
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`Could not delete collection "${collectionName}" (continuing): ${reason}`);
  }

  const allartworkdocs = await getArt(artists);

  // Create the vector store
  const vectorStore = await Chroma.fromDocuments(allartworkdocs, embedding, { collectionName });
  console.log(`Created vector store with ${await vectorStore.collection?.count()} documents`);
}
/**
 * Collects the artwork documents for a list of artists.
 *
 * Runs one search per artist to gather work ids, then hands the combined id
 * list to `fetchArtwork`.
 *
 * @param artists Artist names to search for.
 * @returns Whatever `fetchArtwork` resolves to for the gathered ids.
 */
const getArt = async (artists: string[]) => {
  const artistsWorkIds: number[] = [];
  for (const artist of artists) {
    // First get the ids of the works by each artist
    console.log(`Fetching ${artist}`);
    const thisIds = await fetchArtistWorkIds(artist);
    // Wait a second between searches (presumably to go easy on the API)
    await new Promise((resolve) => setTimeout(resolve, 1000));
    artistsWorkIds.push(...thisIds);
  }

  // The same work can be returned by more than one artist search; keep each id
  // once so the work is not fetched and stored twice.
  const uniqueWorkIds = [...new Set(artistsWorkIds)];

  // now get the actual artwork
  return fetchArtwork(uniqueWorkIds);
}
/**
 * Searches the Art Institute of Chicago API for an artist and returns the ids
 * of the matching works (at most `numberOfArtworks` are requested).
 *
 * @param artist Free-text artist name used as the search query.
 * @returns Ids of the works in the search result; empty when the response
 *          carries no `data` array.
 * @throws Error when the API answers with a non-2xx status.
 */
const fetchArtistWorkIds = async (artist: string): Promise<number[]> => {
  // Encode the name so spaces and reserved characters (&, #, +, ...) stay
  // inside the `q` parameter instead of altering the query string.
  const artistURL = `https://api.artic.edu/api/v1/artworks/search?q=${encodeURIComponent(artist)}&limit=${numberOfArtworks}`;
  const response = await fetch(artistURL);
  if (!response.ok) {
    throw new Error(`Search for "${artist}" failed: ${response.status} ${response.statusText}`);
  }
  const json = await response.json();
  const artistWorks: { id: number }[] = json.data ?? [];
  return artistWorks.map((work) => work.id);
}
// Embedding model handed to Chroma.fromDocuments when the vector store is built
const embedding = new HuggingFaceTransformersEmbeddings({ modelName: "Xenova/all-MiniLM-L6-v2" });
// Turns out there are some weird characters in the descriptions.
// Each key is a single character that sanitize replaces with the mapped text.
const sanitizeCharacterMap: Record<string, string> = {
  // curly quotes are dropped
  "\u2018": "",
  "\u2019": "",
  "\u201C": "",
  "\u201D": "",
  // dashes, ellipsis, special spaces/hyphens
  "\u2013": "-",
  "\u2014": "-",
  "\u2026": "...",
  "\u00A0": " ",
  "\u00AD": "-",
  // Latin-1 symbols
  "\u00B0": " degrees ",
  "\u00B1": " plus or minus ",
  "\u00B2": " squared ",
  "\u00B3": " cubed ",
  "\u00B4": "'",
  "\u00B5": " micro ",
  "\u00B6": " paragraph ",
  "\u00B7": " dot ",
  "\u00B8": ",",
  "\u00B9": " first ",
  "\u00BA": " degrees ",
  "\u00BB": ">>",
  "\u00BC": " 1/4 ",
  "\u00BD": " 1/2 ",
  // ligatures
  "\uFB01": "fi",
  "\uFB02": "fl",
  "\uFB03": "ffi",
  "\uFB04": "ffl",
  "\uFB05": "ft",
  "\uFB06": "st",
  "\uFB07": "st",
  "\uFB08": "st",
  // multiplication sign and accented letters
  "\u00D7": "x",
  "\u00E8": "e",
  "\u00E9": "e",
  "\u00F1": "n",
  "\u00F6": "o",
  "\u00F8": "o",
  "\u00FC": "u",
  "\u00FF": "y",
  "\u0101": "a",
  "\u0103": "a",
  "\u00E0": "a",
  "\u00C9": "E",
};

// One character class covering every key of the map, so all substitutions
// happen in a single pass over the text.
const sanitizeCharacterPattern = new RegExp(`[${Object.keys(sanitizeCharacterMap).join("")}]`, "g");

/**
 * Cleans a text field from the API: strips `<a>`, `<em>` and `<p>` tags,
 * replaces the characters listed in `sanitizeCharacterMap` with plain text,
 * and removes newlines.
 *
 * @param badstring Raw text; `null` or `undefined` when the field is missing.
 * @returns The cleaned text, or a single space `" "` when the input is
 *          missing (callers use that value to detect absent text).
 */
const sanitize = (badstring: string | null | undefined): string => {
  // `== null` covers both null and undefined, so a missing field never
  // reaches `.replace`.
  if (badstring == null) {
    return " ";
  }

  return badstring
    // Opening anchor tags, whatever attributes they carry
    .replace(/<\s*a(?:\s[^>]*)?>/g, "")
    .replace(/<\/a>/g, "")
    .replace(/<\/?em>/g, "")
    .replace(sanitizeCharacterPattern, (ch) => sanitizeCharacterMap[ch] ?? ch)
    .replace(/<\/?p>/g, "")
    .replace(/\n/g, "");
}
/**
 * Downloads each artwork by id and turns every work that has a description
 * into a LangChain `Document`.
 *
 * The sanitized description becomes the page content; the sanitized title,
 * `date_end` and `artist_title` are stored as metadata. Works whose
 * description is missing or blank are skipped. Requests run one at a time.
 *
 * @param workids Ids of the artworks to download.
 * @returns The documents created, in the order of `workids`.
 * @throws Error when a request returns a non-2xx status or a response has no
 *         `data` payload.
 */
const fetchArtwork = async (workids: number[]) => {
  const docsarray: Document[] = [];
  for (const workid of workids) {
    const artworkURL = `https://api.artic.edu/api/v1/artworks/${workid}`;
    const response = await fetch(artworkURL);
    if (!response.ok) {
      throw new Error(`Fetching artwork ${workid} failed: ${response.status} ${response.statusText}`);
    }
    const json = await response.json();
    if (json.data == null) {
      throw new Error(`Artwork ${workid} response did not contain a data payload`);
    }
    const artworkraw: RawArtwork = json.data;

    const description = sanitize(artworkraw.description);
    // sanitize yields " " for a missing description and "" for one that held
    // only markup; neither is worth embedding.
    if (description.trim() === "") {
      continue;
    }

    const doc = new Document({
      pageContent: description,
      metadata: {
        title: sanitize(artworkraw.title),
        date: artworkraw.date_end,
        artistName: artworkraw.artist_title,
      }
    });
    docsarray.push(doc);
    console.log("------------------")
    console.log(`${artworkraw.title} - ${artworkraw.artist_title}`);
  }
  return docsarray;
}
// Script entry point: build the collection, and on failure report the error
// and mark the process as failed instead of leaving the rejection unhandled.
generateSource().catch((err: unknown) => {
  console.error("Failed to generate the art collection:", err);
  process.exitCode = 1;
});