Files
dokuwiki-plugin-aichat/cli.php
Andreas Gohr 22a3672c7d store embedding meta data on start
This way, errors during a new embedding run will not trigger an index
clear on retry
2024-07-22 12:58:30 +02:00

506 lines
16 KiB
PHP

<?php
use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\AbstractCLI;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\aichat\ModelFactory;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;
/**
* DokuWiki Plugin aichat (CLI Component)
*
* @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
* @author Andreas Gohr <gohr@cosmocode.de>
*/
class cli_plugin_aichat extends AbstractCLI
{
/** @var helper_plugin_aichat */
protected $helper;
/**
 * Register the help text, the global --model option and all sub commands
 *
 * @inheritDoc
 */
protected function setup(Options $options)
{
    parent::setup($options);

    $toolHelp = implode(' ', [
        'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made.',
        'This may incur costs.',
    ]);
    $options->setHelp($toolHelp);

    // global option
    $modelHelp = 'Overrides the chat and rephrasing model settings and uses this model instead';
    $options->registerOption('model', $modelHelp, '', 'model');

    // embed [--clear]
    $embedHelp = 'Create embeddings for all pages. This skips pages that already have embeddings';
    $options->registerCommand('embed', $embedHelp);
    $options->registerOption('clear', 'Clear all existing embeddings before creating new ones', 'c', false, 'embed');

    // maintenance
    $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

    // similar <query>
    $options->registerCommand('similar', 'Search for similar pages');
    $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

    // ask <question>
    $options->registerCommand('ask', 'Ask a question');
    $options->registerArgument('question', 'The question to ask', true, 'ask');

    // commands without arguments
    $options->registerCommand('chat', 'Start an interactive chat session');
    $options->registerCommand('models', 'List available models');
    $options->registerCommand('info', 'Get Info about the vector storage and other stats');

    // split <page>
    $options->registerCommand('split', 'Split a page into chunks (for debugging)');
    $options->registerArgument('page', 'The page to split', true, 'split');

    // page [--dump] <page>
    $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
    $options->registerArgument('page', 'The page to check', true, 'page');
    $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

    // tsv [vector.tsv] [meta.tsv]
    $tsvHelp = implode(' ', [
        'Create TSV files for visualizing at http://projector.tensorflow.org/',
        'Not supported on all storages.',
    ]);
    $options->registerCommand('tsv', $tsvHelp);
    $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
    $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
}
/**
 * Apply the optional --model override and dispatch to the requested sub command
 *
 * Prints the help screen when no known command was given.
 *
 * @inheritDoc
 */
protected function main(Options $options)
{
    parent::main($options);

    auth_setup(); // make sure ACLs are initialized

    $model = $options->getOpt('model');
    if ($model) {
        // --model replaces both the chat and the rephrasing model.
        // The key was previously misspelled as 'rephasemodel', so the rephrasing model was never overridden.
        $this->helper->updateConfig(
            ['chatmodel' => $model, 'rephrasemodel' => $model]
        );
    }

    switch ($options->getCmd()) {
        case 'embed':
            $this->createEmbeddings($options->getOpt('clear'));
            break;
        case 'maintenance':
            $this->runMaintenance();
            break;
        case 'similar':
            $this->similar($options->getArgs()[0]);
            break;
        case 'ask':
            $this->ask($options->getArgs()[0]);
            break;
        case 'chat':
            $this->chat();
            break;
        case 'models':
            $this->models();
            break;
        case 'split':
            $this->split($options->getArgs()[0]);
            break;
        case 'page':
            $this->page($options->getArgs()[0], $options->getOpt('dump'));
            break;
        case 'info':
            $this->showinfo();
            break;
        case 'tsv':
            // both file names are optional and fall back to names in the current directory
            $args = $options->getArgs();
            $vector = $args[0] ?? 'vector.tsv';
            $meta = $args[1] ?? 'meta.tsv';
            $this->tsv($vector, $meta);
            break;
        default:
            echo $options->help();
    }
}
/**
 * Print the configured models, the stored run data and the storage statistics
 *
 * @return void
 */
protected function showinfo()
{
    $helper = $this->helper;

    $models = [
        'embed model' => (string) $helper->getEmbeddingModel(),
        'rephrase model' => (string) $helper->getRephraseModel(),
        'chat model' => (string) $helper->getChatModel(),
    ];
    $runData = $helper->getRunData();
    $storageStats = $helper->getStorage()->statistics();

    $this->printTable(array_merge($models, $runData, $storageStats));
}
/**
 * Print key value data as an indented table
 *
 * Array values are treated as nested tables: their key is printed as a heading row
 * and their content is printed recursively, one level deeper.
 *
 * @param array $data
 * @param int $level nesting depth, each level indents by two columns
 * @return void
 */
protected function printTable($data, $level = 0)
{
    $formatter = new TableFormatter($this->colors);
    $widths = [$level * 2, 20, '*'];

    foreach ($data as $label => $entry) {
        $isNested = is_array($entry);

        echo $formatter->format(
            $widths,
            ['', $label, $isNested ? '' : $entry],
            [
                Colors::C_LIGHTBLUE,
                Colors::C_LIGHTBLUE,
                $isNested ? Colors::C_LIGHTBLUE : Colors::C_LIGHTGRAY,
            ]
        );

        if ($isNested) {
            $this->printTable($entry, $level + 1);
        }
    }
}
/**
 * Check chunk availability for a given page
 *
 * The page name is normalized with cleanID() once and the normalized ID is used for both
 * the index lookup and the storage query, so both always refer to the same page.
 *
 * @param string $page The page to check
 * @param bool $dump When true, the found chunks are printed as pretty printed JSON
 * @return void
 */
protected function page($page, $dump = false)
{
    $page = cleanID($page);

    $indexer = new Indexer();
    $pages = $indexer->getPages();
    // strict comparison avoids numeric-string juggling between different page IDs
    $pos = array_search($page, $pages, true);

    if ($pos === false) {
        $this->error('Page not found');
        return;
    }

    $storage = $this->helper->getStorage();
    // NOTE(review): the second argument is presumably the first chunk ID reserved for this page
    // (index position * 100) - confirm against the storage implementation
    $chunks = $storage->getPageChunks($page, $pos * 100);
    if ($chunks) {
        $this->success('Found ' . count($chunks) . ' chunks');
        if ($dump) {
            echo json_encode($chunks, JSON_PRETTY_PRINT);
        }
    } else {
        $this->error('No chunks found');
    }
}
/**
 * Print the chunks a page would be split into, each followed by a separator line
 *
 * @param string $page
 * @return void
 * @throws Exception
 */
protected function split($page)
{
    $separator = '--------------------------------';
    $embeddings = $this->helper->getEmbeddings();
    $chunks = $embeddings->createPageChunks($page, 0);

    foreach ($chunks as $chunk) {
        echo $chunk->getText(), "\n";
        $this->colors->ptln($separator, Colors::C_LIGHTPURPLE);
    }

    $total = count($chunks);
    $this->success('Split into ' . $total . ' chunks');
}
/**
 * Interactive Chat Session
 *
 * Repeatedly reads a question, asks it with the previous questions and answers as history
 * and prints the answer. The session ends when readLine() returns an empty string.
 *
 * @return void
 * @throws Exception
 */
protected function chat()
{
    $history = [];

    // compare against '' explicitly: a plain truthiness check would end the session on the input "0"
    while (($q = (string) $this->readLine('Your Question')) !== '') {
        // usage is reported per question, so start every round with fresh counters
        $this->helper->getChatModel()->resetUsageStats();
        $this->helper->getRephraseModel()->resetUsageStats();
        $this->helper->getEmbeddingModel()->resetUsageStats();

        $result = $this->helper->askChatQuestion($q, $history);
        $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
        // the history stores the question as returned in the result, not the raw input
        $history[] = [$result['question'], $result['answer']];
        $this->printAnswer($result);
    }
}
/**
 * List the known chat and embedding models as tables
 *
 * A model's name is printed in green when its 'instance' entry is truthy and in red otherwise.
 *
 * @return void
 */
protected function models()
{
    $models = (new ModelFactory($this->conf))->getModels();
    $table = new TableFormatter($this->colors);

    // color for the first column of a model row
    $nameColor = static function ($model) {
        return $model['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED;
    };

    // chat models
    $widths = [30, 20, 20, '*'];
    echo "==== Chat Models ====\n\n";
    echo $table->format(
        $widths,
        ['Model', 'Token Limits', 'Price USD/M', 'Description'],
        array_fill(0, 4, Colors::C_LIGHTBLUE)
    );
    foreach ($models['chat'] as $modelName => $model) {
        $limits = sprintf(" In: %7d\nOut: %7d", $model['inputTokens'], $model['outputTokens']);
        $prices = sprintf(" In: %.2f\nOut: %.2f", $model['inputTokenPrice'], $model['outputTokenPrice']);
        echo $table->format(
            $widths,
            [$modelName, $limits, $prices, $model['description'] . "\n"],
            [$nameColor($model)]
        );
    }

    // embedding models
    $widths = [30, 10, 10, 10, '*'];
    echo "==== Embedding Models ====\n\n";
    echo $table->format(
        $widths,
        ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
        array_fill(0, 5, Colors::C_LIGHTBLUE)
    );
    foreach ($models['embedding'] as $modelName => $model) {
        $limit = sprintf("%7d", $model['inputTokens']);
        $price = sprintf("%.2f", $model['inputTokenPrice']);
        echo $table->format(
            $widths,
            [$modelName, $limit, $price, $model['dimensions'], $model['description'] . "\n"],
            [$nameColor($model)]
        );
    }

    $this->colors->ptln('Current prices may differ', Colors::C_RED);
}
/**
 * Answer one standalone question and print the result
 *
 * @param string $query
 * @return void
 * @throws Exception
 */
protected function ask($query)
{
    $this->printAnswer(
        $this->helper->askQuestion($query)
    );
}
/**
 * Print the chunks that are similar to the query
 *
 * When a language limit is configured it is announced and passed on to the lookup.
 *
 * @param string $query
 * @return void
 */
protected function similar($query)
{
    $lang = $this->helper->getLanguageLimit();
    if ($lang) {
        $this->info('Limiting results to {lang}', ['lang' => $lang]);
    }

    $embeddings = $this->helper->getEmbeddings();
    $this->printSources($embeddings->getSimilarChunks($query, $lang));
}
/**
 * Run the storage maintenance and record when it ran
 *
 * Reports peak memory and elapsed time, then stores the current date in the run data.
 *
 * @return void
 */
protected function runMaintenance()
{
    $startedAt = time();

    $storage = $this->helper->getStorage();
    $storage->runMaintenance();

    $peakMemory = filesize_h(memory_get_peak_usage(true));
    $this->notice('Peak memory used: {memory}', ['memory' => $peakMemory]);

    $minutes = round((time() - $startedAt) / 60, 2);
    $this->notice('Spent time: {time}min', ['time' => $minutes]);

    $runData = $this->helper->getRunData();
    $runData['maintenance ran at'] = dformat();
    $this->helper->setRunData($runData);
}
/**
 * Create chunks and embeddings for all pages
 *
 * A clear is forced when the embedding model differs from the one recorded by the last run.
 * The run data is stored before indexing starts, not after it.
 *
 * @param bool $clear Clear all existing embeddings before creating new ones
 * @return void
 */
protected function createEmbeddings($clear)
{
    [$skipRE, $matchRE] = $this->getRegexps();

    $runData = $this->helper->getRunData();
    $previousModel = $runData['embed used'] ?? '';
    $currentModel = (string) $this->helper->getEmbeddingModel();

    $modelChanged = $previousModel && $previousModel != $currentModel;
    if (!$clear && $modelChanged) {
        $this->warning('Embedding model has changed since last run. Forcing an index rebuild');
        $clear = true;
    }

    // record the run up front, before the index is created
    $runData['embed ran at'] = dformat();
    $runData['embed used'] = $currentModel;
    $this->helper->setRunData($runData);

    $startedAt = time();
    $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);

    $peakMemory = filesize_h(memory_get_peak_usage(true));
    $this->notice('Peak memory used: {memory}', ['memory' => $peakMemory]);

    $minutes = round((time() - $startedAt) / 60, 2);
    $this->notice('Spent time: {time}min', ['time' => $minutes]);
}
/**
 * Let the storage dump its data into two TSV files
 *
 * @param string $vector name of the vector file
 * @param string $meta name of the meta file
 * @return void
 */
protected function tsv($vector, $meta)
{
    $this->helper->getStorage()->dumpTSV($vector, $meta);
    $this->success("written to {$vector} and {$meta}");
}
/**
 * Print an answer, the sources it is based on and the usage statistics
 *
 * @param array $answer result with the keys 'answer' and 'sources'
 * @return void
 */
protected function printAnswer($answer)
{
    $blankLine = "\n";

    // the answer text itself
    $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
    echo $blankLine;

    // the sources, separated from the usage line by another blank line
    $this->printSources($answer['sources']);
    echo $blankLine;

    $this->printUsage();
}
/**
 * Print one tab-indented line per source: page, chunk ID and score
 *
 * @param Chunk[] $sources
 * @return void
 */
protected function printSources($sources)
{
    /** @var Chunk $chunk */
    foreach ($sources as $chunk) {
        $page = $chunk->getPage();
        $id = $chunk->getId();
        $score = $chunk->getScore();

        $this->colors->ptln("\t{$page} {$id} ({$score})", Colors::C_LIGHTBLUE);
    }
}
/**
 * Print the combined usage statistics of the chat, rephrase and embedding models
 *
 * Requests, time, tokens and cost are each summed over all three models.
 *
 * @return void
 */
protected function printUsage()
{
    $chat = $this->helper->getChatModel()->getUsageStats();
    $rephrase = $this->helper->getRephraseModel()->getUsageStats();
    $embed = $this->helper->getEmbeddingModel()->getUsageStats();

    $this->info(
        'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
        [
            'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
            'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
            // tokens and cost previously counted the chat model twice and ignored the rephrase model
            'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
            'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
        ]
    );
}
/**
 * Interactively ask for a value from the user
 *
 * The prompt is repeated until a non-empty line was entered. When standard input can not be
 * opened or has reached its end (e.g. Ctrl+D or exhausted piped input) an empty string is
 * returned instead of prompting forever.
 *
 * @param string $prompt
 * @return string the trimmed input, or '' when no more input is available
 */
protected function readLine($prompt)
{
    $value = '';

    while ($value === '') {
        echo $prompt;
        echo ': ';

        $fh = fopen('php://stdin', 'r');
        if ($fh === false) {
            return '';
        }
        $line = fgets($fh);
        fclose($fh);

        if ($line === false) {
            // end of input: finish the prompt line and give up instead of looping endlessly
            echo "\n";
            return '';
        }

        $value = trim($line);
    }

    return $value;
}
/**
 * Build the skip and match regular expressions from the configuration
 *
 * Each configured pattern is wrapped in '/' delimiters and test-compiled. A pattern that
 * fails to compile is reported and replaced by an empty string; an unset option also
 * results in an empty string.
 *
 * @return string[] [$skipRE, $matchRE]
 */
protected function getRegexps()
{
    // config option => message prefix shown when the pattern is accepted
    $options = [
        'skipRegex' => 'Skipping pages matching ',
        'matchRegex' => 'Only indexing pages matching ',
    ];

    $regexps = [];
    foreach ($options as $option => $acceptedPrefix) {
        $regexps[$option] = '';

        $configured = $this->getConf($option);
        if (!$configured) {
            continue;
        }

        $candidate = '/' . $configured . '/';
        // matching against an empty subject only checks whether the pattern compiles
        if (@preg_match($candidate, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $option . '\']. Ignored.');
            continue;
        }

        $this->success($acceptedPrefix . $candidate);
        $regexps[$option] = $candidate;
    }

    return [$regexps['skipRegex'], $regexps['matchRegex']];
}
}