Compare commits

Comparing brucemacd/... with jessegross: 1 commit

| Author | SHA1 | Date |
|---|---|---|
|  | c14f348ffa |  |
							
								
								
									
llama/base64.hpp (vendored): the entire 392-line file is removed.

@@ -1,392 +0,0 @@

```cpp
/*
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org>
*/

#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_

#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>

class base64_error : public std::runtime_error
{
public:
    using std::runtime_error::runtime_error;
};

class base64
{
public:
    enum class alphabet
    {
        /** the alphabet is detected automatically */
        auto_,
        /** the standard base64 alphabet is used */
        standard,
        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
        url_filename_safe
    };

    enum class decoding_behavior
    {
        /** if the input is not padded, the remaining bits are ignored */
        moderate,
        /** if a padding character is encounter decoding is finished */
        loose
    };

    /**
     Encodes all the elements from `in_begin` to `in_end` to `out`.

     @warning The source and destination cannot overlap. The destination must be able to hold at least
     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.

     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
     8 bits
     @tparam Output_iterator the destination; the elements written to it are from the type `char`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @returns the iterator to the next element past the last element copied
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet = alphabet::standard)
    {
        constexpr auto pad = '=';
        const char* alpha  = alphabet == alphabet::url_filename_safe
                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

        while (in_begin != in_end) {
            std::uint8_t i0 = 0, i1 = 0, i2 = 0;

            // first character
            i0 = static_cast<std::uint8_t>(*in_begin);
            ++in_begin;

            *out = alpha[i0 >> 2 & 0x3f];
            ++out;

            // part of first character and second
            if (in_begin != in_end) {
                i1 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;

                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
                ++out;
            } else {
                *out = alpha[(i0 & 0x3) << 4];
                ++out;

                // last padding
                *out = pad;
                ++out;

                // last padding
                *out = pad;
                ++out;

                break;
            }

            // part of second character and third
            if (in_begin != in_end) {
                i2 = static_cast<std::uint8_t>(*in_begin);
                ++in_begin;

                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
                ++out;
            } else {
                *out = alpha[(i1 & 0xf) << 2];
                ++out;

                // last padding
                *out = pad;
                ++out;

                break;
            }

            // rest of third
            *out = alpha[i2 & 0x3f];
            ++out;
        }

        return out;
    }
    /**
     Encodes a string.

     @param str the string that should be encoded
     @param alphabet which alphabet should be used
     @returns the encoded base64 string
     @throws see base64::encode()
    */
    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
    {
        std::string result;

        result.reserve(required_encode_size(str.length()) + 1);

        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);

        return result;
    }
    /**
     Encodes a char array.

     @param buffer the char array
     @param size the size of the array
     @param alphabet which alphabet should be used
     @returns the encoded string
    */
    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
    {
        std::string result;

        result.reserve(required_encode_size(size) + 1);

        encode(buffer, buffer + size, std::back_inserter(result), alphabet);

        return result;
    }
    /**
     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
     in other words: inplace decoding is possible.

     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
     otherwise the behavior depends on the output iterator.

     @tparam Input_iterator the source; the returned elements are cast to `char`
     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
     @param in_begin the beginning of the source
     @param in_end the ending of the source
     @param out the destination iterator
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the iterator to the next element past the last element copied
     @throws base64_error depending on the set behavior
     @throws see `Input_iterator` and `Output_iterator`
    */
    template<typename Input_iterator, typename Output_iterator>
    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
                                  alphabet alphabet          = alphabet::auto_,
                                  decoding_behavior behavior = decoding_behavior::moderate)
    {
        //constexpr auto pad = '=';
        std::uint8_t last  = 0;
        auto bits          = 0;

        while (in_begin != in_end) {
            auto c = *in_begin;
            ++in_begin;

            if (c == '=') {
                break;
            }

            auto part = _base64_value(alphabet, c);

            // enough bits for one byte
            if (bits + 6 >= 8) {
                *out = (last << (8 - bits)) | (part >> (bits - 2));
                ++out;

                bits -= 2;
            } else {
                bits += 6;
            }

            last = part;
        }

        // check padding
        if (behavior != decoding_behavior::loose) {
            while (in_begin != in_end) {
                auto c = *in_begin;
                ++in_begin;

                if (c != '=') {
                    throw base64_error("invalid base64 character.");
                }
            }
        }

        return out;
    }
    /**
     Decodes a string.

     @param str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;

        result.reserve(max_decode_size(str.length()));

        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);

        return result;
    }
    /**
     Decodes a string.

     @param buffer the base64 encoded buffer
     @param size the size of the buffer
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the decoded string
     @throws see base64::decode()
    */
    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
                              decoding_behavior behavior = decoding_behavior::moderate)
    {
        std::string result;

        result.reserve(max_decode_size(size));

        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);

        return result;
    }
    /**
     Decodes a string inplace.

     @param[in,out] str the base64 encoded string
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @throws base64::decode_inplace()
    */
    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
                               decoding_behavior behavior = decoding_behavior::moderate)
    {
        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
    }
    /**
     Decodes a char array inplace.

     @param[in,out] str the string array
     @param size the length of the array
     @param alphabet which alphabet should be used
     @param behavior the behavior when an error was detected
     @returns the pointer to the next element past the last element decoded
     @throws base64::decode_inplace()
    */
    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
                                decoding_behavior behavior = decoding_behavior::moderate)
    {
        return decode(str, str + size, str, alphabet, behavior);
    }
    /**
     Returns the required decoding size for a given size. The value is calculated with the following formula:

     $$
     \lceil \frac{size}{4} \rceil \cdot 3
     $$

     @param size the size of the encoded input
     @returns the size of the resulting decoded buffer; this the absolute maximum
    */
    static std::size_t max_decode_size(std::size_t size) noexcept
    {
        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
    }
    /**
     Returns the required encoding size for a given size. The value is calculated with the following formula:

     $$
     \lceil \frac{size}{3} \rceil \cdot 4
     $$

     @param size the size of the decoded input
     @returns the size of the resulting encoded buffer
    */
    static std::size_t required_encode_size(std::size_t size) noexcept
    {
        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
    }

private:
    static std::uint8_t _base64_value(alphabet& alphabet, char c)
    {
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        } else if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        } else if (c >= '0' && c <= '9') {
            return c - '0' + 52;
        }

        // comes down to alphabet
        if (alphabet == alphabet::standard) {
            if (c == '+') {
                return 62;
            } else if (c == '/') {
                return 63;
            }
        } else if (alphabet == alphabet::url_filename_safe) {
            if (c == '-') {
                return 62;
            } else if (c == '_') {
                return 63;
            }
        } // auto detect
        else {
            if (c == '+') {
                alphabet = alphabet::standard;

                return 62;
            } else if (c == '/') {
                alphabet = alphabet::standard;

                return 63;
            } else if (c == '-') {
                alphabet = alphabet::url_filename_safe;

                return 62;
            } else if (c == '_') {
                alphabet = alphabet::url_filename_safe;

                return 63;
            }
        }

        throw base64_error("invalid base64 character.");
    }
};

#endif // !PUBLIC_DOMAIN_BASE64_HPP_
```
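For orientation, here is a minimal round-trip sketch against the removed header. The `main` below is illustrative and not part of the diff; it assumes the header is saved as `base64.hpp` on the include path.

```cpp
// Minimal round-trip over the vendored base64 header (illustrative only).
#include <cassert>
#include <iostream>
#include <string>

#include "base64.hpp" // the header shown above

int main() {
    const std::string plain = "hello world"; // 11 bytes

    // required_encode_size(11) = ceil(11 / 3) * 4 = 16 output characters
    const std::string enc = base64::encode(plain); // "aGVsbG8gd29ybGQ="

    // alphabet::auto_ (the decode default) detects standard vs. URL-safe input
    const std::string dec = base64::decode(enc);
    assert(dec == plain);

    std::cout << enc << '\n';
    return 0;
}
```

Note that `max_decode_size` is an upper bound: for the 16-character input it reserves ceil(16 / 4) * 3 = 12 bytes, while the decoded payload is 11 bytes because of the trailing `=` padding.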
							
								
								
									
llama/common.cpp (vendored): 2092 changes; the file diff is suppressed because it is too large.
							
							
								
								
									
llama/common.h (vendored): the entire 581-line file is removed.

@@ -1,581 +0,0 @@

```cpp
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// Various helper functions and utilities

#pragma once

#include "llama.h"

#include <string>
#include <vector>
#include <sstream>

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32

#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct llama_lora_adapter_info {
    std::string path;
    float scale;
};

struct llama_lora_adapter_container : llama_lora_adapter_info {
    struct llama_lora_adapter * adapter;
};

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

//
// CPU utils
//

struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool     strict_cpu                  = false;   // Use strict CPU placement
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,

    LLAMA_EXAMPLE_COUNT,
};

enum gpt_sampler_type {
    GPT_SAMPLER_TYPE_NONE        = 0,
    GPT_SAMPLER_TYPE_TOP_K       = 1,
    GPT_SAMPLER_TYPE_TOP_P       = 2,
    GPT_SAMPLER_TYPE_MIN_P       = 3,
    GPT_SAMPLER_TYPE_TFS_Z       = 4,
    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

// sampler parameters
struct gpt_sampler_params {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev            = 64;    // number of previous tokens to remember
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   min_p             = 0.05f; // 0.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float   dynatemp_range    = 0.00f; // 0.0 = disabled
    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   penalty_repeat    = 1.00f; // 1.0 = disabled
    float   penalty_freq      = 0.00f; // 0.0 = disabled
    float   penalty_present   = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
    bool    penalize_nl       = false; // consider newlines as a repeatable token
    bool    ignore_eos        = false;
    bool    no_perf           = false; // disable performance metrics

    std::vector<enum gpt_sampler_type> samplers = {
        GPT_SAMPLER_TYPE_TOP_K,
        GPT_SAMPLER_TYPE_TFS_Z,
        GPT_SAMPLER_TYPE_TYPICAL_P,
        GPT_SAMPLER_TYPE_TOP_P,
        GPT_SAMPLER_TYPE_MIN_P,
        GPT_SAMPLER_TYPE_TEMPERATURE
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

    // print the parameters into a string
    std::string print() const;
};

struct gpt_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            =     1; // number of parallel sequences to decode
    int32_t n_sequences           =     1; // number of sequences to decode
    float   p_split               =  0.1f; // speculative decoding split probability
    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
    float   rope_freq_base        =  0.0f; // RoPE base frequency
    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
    struct cpu_params draft_cpuparams;
    struct cpu_params draft_cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

    struct gpt_sampler_params sparams;

    std::string model                = ""; // model path                                                    // NOLINT
    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                     //                                       (which is more convenient to use for plotting)
                                     //
    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score

    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool   kl_divergence    = false; // compute KL divergence

    bool usage             = false; // print usage
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
    std::vector<std::string> image; // path to image file(s)

    // embedding
    bool embedding         = false; // get only sentence embedding
    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embendings
    bool reranking         = false; // enable reranking support on server

    // server params
    int32_t port           = 8080;         // server listens on this network port
    int32_t timeout_read   = 600;          // http read timeout in seconds
    int32_t timeout_write  = timeout_read; // http write timeout in seconds
    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
    std::string system_prompt = "";                                                                         // NOLINT
    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

    std::string ssl_file_key  = "";                                                                         // NOLINT
    std::string ssl_file_cert = "";                                                                         // NOLINT

    bool endpoint_slots   = true;
    bool endpoint_metrics = false;

    bool log_json = false;

    std::string slot_save_path;

    float slot_prompt_similarity = 0.5f;

    // batched-bench params
    bool is_pp_shared = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file

    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";

    // batched-bench params
    bool batched_bench_output_jsonl = false;
};

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void gpt_init();

std::string gpt_params_get_system_info(const gpt_params & params);

bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);

//
// String utils
//

std::vector<std::string> string_split(std::string input, char separator);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

//
// Filesystem utils
//

bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

//
// Model utils
//

struct llama_init_result {
    struct llama_model   * model   = nullptr;
    struct llama_context * context = nullptr;
    std::vector<llama_lora_adapter_container> lora_adapters;
};

struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);

struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);

// Batch utils

void llama_batch_clear(struct llama_batch & batch);

void llama_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);

//
// Vocab utils
//

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token,
                       bool          special = true);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string llama_detokenize(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

//
// Chat template utils
//

// same with llama_chat_message, but uses std::string
struct llama_chat_msg {
    std::string role;
    std::string content;
};

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);

// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & chat,
        bool add_ass);

// Format single message, while taking into account the position of that message in chat history
std::string llama_chat_format_single(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & past_msg,
        const llama_chat_msg & new_msg,
        bool add_ass);

// Returns an example of formatted chat
std::string llama_chat_format_example(const struct llama_model * model,
        const std::string & tmpl);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
//

void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

//
// Control vector utils
//

struct llama_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct llama_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);

//
// Split utils
//

static const char * const LLM_KV_SPLIT_NO            = "split.no";
static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

//
// YAML utils
//

void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
```
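Of the helpers above, the templated `string_split` is fully defined in the header, so its behavior can be shown standalone. In the sketch below the template body is copied verbatim; the `main` and the sample input are illustrative, not part of the diff.

```cpp
// Standalone sketch of the string_split<T> helper from the removed common.h.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    // e.g. parsing a comma-separated "--tensor-split"-style argument
    for (float f : string_split<float>("3,1", ',')) {
        std::cout << f << '\n'; // prints 3 then 1
    }
    return 0;
}
```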
							
								
								
									
llama/json-schema-to-grammar.cpp (vendored): 1071 changes; the file diff is suppressed because it is too large.
							
								
								
									
llama/json-schema-to-grammar.h (vendored): the entire 34-line file is removed.

@@ -1,34 +0,0 @@

```cpp
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"

std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
```
							
								
								
									
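
For reference, the converter declared above can be driven like this (a sketch assuming the vendored json.hpp and this header are on the include path; the schema itself is made up):

#include "json-schema-to-grammar.h"

#include <cstdio>
#include <string>

int main() {
    // a hypothetical schema; any supported JSON Schema works the same way
    auto schema = nlohmann::ordered_json::parse(R"({
        "type": "object",
        "properties": { "name": { "type": "string" } },
        "required": ["name"]
    })");

    const std::string grammar = json_schema_to_grammar(schema);
    std::printf("%s\n", grammar.c_str());

    return 0;
}
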
24766	llama/json.hpp	vendored
	(File diff suppressed because it is too large)

148	llama/llama.go
@@ -67,7 +67,6 @@ package llama
#include "ggml.h"
#include "llava.h"
#include "mllama.h"
#include "sampling_ext.h"

bool llamaProgressCallback(float progress, void *user_data);

@@ -88,6 +87,7 @@ import (
	_ "embed"
	"errors"
	"fmt"
	"math"
	"runtime"
	"runtime/cgo"
	"slices"
@@ -181,6 +181,15 @@ func (c *Context) Model() *Model {
	return &Model{c: C.llama_get_model(c.c)}
}

func (c *Context) GetLogitsIth(i int) ([]float32, error) {
	logits := (*float32)(unsafe.Pointer(C.llama_get_logits_ith(c.c, C.int(i))))
	if logits == nil {
		return nil, errors.New("unable to get logits")
	}

	return unsafe.Slice(logits, c.Model().NumVocab()), nil
}

func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
	C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
}
@@ -613,11 +622,6 @@ func (c *Context) Synchronize() {
}

// sampling
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
type SamplingContext struct {
	c *C.struct_gpt_sampler
}

type SamplingParams struct {
	TopK           int
	TopP           float32
@@ -637,46 +641,120 @@ type SamplingParams struct {
	Grammar        string
}

type SamplingContext struct {
	chain   *C.struct_llama_sampler
	grammar *C.struct_llama_sampler
}

func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
	var cparams C.struct_gpt_sampler_cparams
	cparams.top_k = C.int32_t(params.TopK)
	cparams.top_p = C.float(params.TopP)
	cparams.min_p = C.float(params.MinP)
	cparams.tfs_z = C.float(params.TfsZ)
	cparams.typical_p = C.float(params.TypicalP)
	cparams.temp = C.float(params.Temp)
	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
	cparams.penalty_freq = C.float(params.PenaltyFreq)
	cparams.penalty_present = C.float(params.PenaltyFreq)
	cparams.mirostat = C.int32_t(params.Mirostat)
	cparams.mirostat_tau = C.float(params.MirostatTau)
	cparams.mirostat_eta = C.float(params.MirostatEta)
	cparams.penalize_nl = C.bool(params.PenalizeNl)
	cparams.seed = C.uint32_t(params.Seed)
	var s SamplingContext
	runtime.SetFinalizer(&s, func(s *SamplingContext) { s.free() })

	sparams := C.llama_sampler_chain_default_params()
	s.chain = C.llama_sampler_chain_init(sparams)

	grammar := C.CString(params.Grammar)
	defer C.free(unsafe.Pointer(grammar))
	root := C.CString("root")
	defer C.free(unsafe.Pointer(root))
	s.grammar = C.llama_sampler_init_grammar(model.c, grammar, root)

	cparams.grammar = grammar
	context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
	if context.c == nil {
		return nil, errors.New("unable to create sampling context")
	C.llama_sampler_chain_add(s.chain,
		C.llama_sampler_init_penalties(
			C.llama_n_vocab(model.c),
			C.llama_token_eos(model.c),
			C.llama_token_nl(model.c),
			C.int32_t(params.RepeatLastN),
			C.float(params.PenaltyRepeat),
			C.float(params.PenaltyFreq),
			C.float(params.PenaltyPresent),
			C.bool(params.PenalizeNl),
			false))

	if params.Temp > 0 {
		switch params.Mirostat {
		case 0:
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_top_k(C.int32_t(params.TopK)))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_tail_free(C.float(params.TfsZ), 0))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_typical(C.float(params.TypicalP), 0))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_top_p(C.float(params.TopP), 0))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_min_p(C.float(params.MinP), 0))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))

			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_softmax())
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_dist(C.uint32_t(params.Seed)))
		case 1:
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_mirostat(C.llama_n_vocab(model.c),
				C.uint32_t(params.Seed), C.float(params.MirostatTau), C.float(params.MirostatEta), 100))
		case 2:
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_temp(C.float(params.Temp)))
			C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_mirostat_v2(C.uint32_t(params.Seed),
				C.float(params.MirostatTau), C.float(params.MirostatEta)))
		default:
			return nil, fmt.Errorf("sampling: unknown mirostat version: %v", params.Mirostat)
		}
	} else {
		C.llama_sampler_chain_add(s.chain, C.llama_sampler_init_greedy())
	}

	runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })

	return context, nil
	return &s, nil
}

func (s *SamplingContext) Reset() {
	C.gpt_sampler_creset(s.c)
}
func (s *SamplingContext) Sample(llamaContext *Context, idx int) (int, error) {
	logits, err := llamaContext.GetLogitsIth(idx)
	if err != nil {
		return 0, err
	}

func (s *SamplingContext) Sample(llamaContext *Context, idx int) int {
	return int(C.gpt_sampler_csample(s.c, llamaContext.c, C.int(idx)))
	numVocab := llamaContext.Model().NumVocab()

	tokenData := make([]C.llama_token_data, numVocab)
	var tokenDataPin runtime.Pinner
	tokenDataPin.Pin(&tokenData[0])
	defer tokenDataPin.Unpin()

	for i := range tokenData {
		tokenData[i] = C.llama_token_data{id: C.llama_token(i), logit: C.float(logits[i])}
	}
	tokenDataArray := C.llama_token_data_array{data: &tokenData[0], size: C.size_t(len(tokenData)), selected: -1}

	C.llama_sampler_apply(s.chain, &tokenDataArray)

	id := tokenData[tokenDataArray.selected].id

	// Check if the selected token is allowed by the grammar
	// If it is allowed then return it, otherwise evaluate the grammar on all
	// tokens and resample (slow)
	tokenData[0] = C.llama_token_data{id: id, logit: 1}
	tokenDataArray = C.llama_token_data_array{data: &tokenData[0], size: 1, selected: -1}

	C.llama_sampler_apply(s.grammar, &tokenDataArray)
	if !math.IsInf(float64(tokenData[0].logit), -1) {
		return int(id), nil
	}

	for i := range tokenData {
		tokenData[i] = C.llama_token_data{id: C.llama_token(i), logit: C.float(logits[i])}
	}
	tokenDataArray = C.llama_token_data_array{data: &tokenData[0], size: C.size_t(len(tokenData)), selected: -1}

	C.llama_sampler_apply(s.grammar, &tokenDataArray)
	C.llama_sampler_apply(s.chain, &tokenDataArray)

	return int(tokenData[tokenDataArray.selected].id), nil
}

func (s *SamplingContext) Accept(id int, applyGrammar bool) {
	C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar))
	if applyGrammar {
		C.llama_sampler_accept(s.grammar, C.llama_token(id))
	}
	C.llama_sampler_accept(s.chain, C.llama_token(id))
}

func (s *SamplingContext) free() {
	if s != nil {
		C.llama_sampler_free(s.grammar)
		C.llama_sampler_free(s.chain)
	}
}
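
In the rewritten Sample above, the grammar sampler is applied to a single-element token array first: grammar samplers mark forbidden tokens by forcing their logit to -Inf, so accepting the common case costs one check, and only a rejection pays for masking the whole vocabulary. A self-contained sketch of that two-path shape (plain C++; every name here is a hypothetical stand-in, not llama.cpp API):

#include <cmath>
#include <cstdio>
#include <vector>

// stand-in for a grammar sampler: disallowed tokens get a logit of -INFINITY
static void apply_grammar_mask(std::vector<float> & logits, const std::vector<bool> & allowed) {
    for (size_t i = 0; i < logits.size(); i++) {
        if (!allowed[i]) {
            logits[i] = -INFINITY;
        }
    }
}

int main() {
    std::vector<float> logits  = {1.5f, 0.2f, 3.0f};
    std::vector<bool>  allowed = {true, true, false};

    int candidate = 2; // token the sampling chain selected

    // fast path: run the grammar over the single candidate only
    std::vector<float> one    = {logits[candidate]};
    std::vector<bool>  one_ok = {allowed[candidate]};
    apply_grammar_mask(one, one_ok);

    if (!std::isinf(one[0])) {
        std::printf("token %d accepted\n", candidate);
    } else {
        // slow path: mask the whole vocabulary, then pick again
        apply_grammar_mask(logits, allowed);
        size_t best = 0;
        for (size_t i = 1; i < logits.size(); i++) {
            if (logits[i] > logits[best]) {
                best = i;
            }
        }
        std::printf("resampled token %zu\n", best);
    }

    return 0;
}
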
427	llama/log.cpp	vendored
@@ -1,427 +0,0 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "log.h"

#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>

int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;

void gpt_log_set_verbosity_thold(int verbosity) {
    gpt_log_verbosity_thold = verbosity;
}

#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD    "\033[1m"
#define LOG_COL_RED     "\033[31m"
#define LOG_COL_GREEN   "\033[32m"
#define LOG_COL_YELLOW  "\033[33m"
#define LOG_COL_BLUE    "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN    "\033[36m"
#define LOG_COL_WHITE   "\033[37m"

static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}

// colors
enum gpt_log_col : int {
    GPT_LOG_COL_DEFAULT = 0,
    GPT_LOG_COL_BOLD,
    GPT_LOG_COL_RED,
    GPT_LOG_COL_GREEN,
    GPT_LOG_COL_YELLOW,
    GPT_LOG_COL_BLUE,
    GPT_LOG_COL_MAGENTA,
    GPT_LOG_COL_CYAN,
    GPT_LOG_COL_WHITE,
};

// disable colors by default
static std::vector<const char *> g_col = {
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
};

struct gpt_log_entry {
    enum ggml_log_level level;

    bool prefix;

    int64_t timestamp;

    std::vector<char> msg;

    // signals the worker thread to stop
    bool is_end;

    void print(FILE * file = nullptr) const {
        FILE * fcur = file;
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }

            fcur = stdout;

            if (level != GGML_LOG_LEVEL_NONE) {
                fcur = stderr;
            }
        }

        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
            if (timestamp) {
                // [M.s.ms.us]
                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
                        g_col[GPT_LOG_COL_BLUE],
                        (int) (timestamp / 1000000 / 60),
                        (int) (timestamp / 1000000 % 60),
                        (int) (timestamp / 1000 % 1000),
                        (int) (timestamp % 1000),
                        g_col[GPT_LOG_COL_DEFAULT]);
            }

            switch (level) {
                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
                default:
                    break;
            }
        }

        fprintf(fcur, "%s", msg.data());

        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
        }

        fflush(fcur);
    }
};

struct gpt_log {
    // default capacity - will be expanded if needed
    gpt_log() : gpt_log(256) {}

    gpt_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
        running = false;
        t_start = t_us();

        // initial message size - will be expanded if longer messages arrive
        entries.resize(capacity);
        for (auto & entry : entries) {
            entry.msg.resize(256);
        }

        head = 0;
        tail = 0;

        resume();
    }

    ~gpt_log() {
        pause();
        if (file) {
            fclose(file);
        }
    }

private:
    std::mutex mtx;
    std::thread thrd;
    std::condition_variable cv;

    FILE * file;

    bool prefix;
    bool timestamps;
    bool running;

    int64_t t_start;

    // ring buffer of entries
    std::vector<gpt_log_entry> entries;
    size_t head;
    size_t tail;

    // worker thread copies into this
    gpt_log_entry cur;

public:
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
        std::lock_guard<std::mutex> lock(mtx);

        if (!running) {
            // discard messages while the worker thread is paused
            return;
        }

        auto & entry = entries[tail];

        {
            // cannot use args twice, so make a copy in case we need to expand the buffer
            va_list args_copy;
            va_copy(args_copy, args);

#if 1
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
            }
#else
            // hack for bolding arguments

            std::stringstream ss;
            for (int i = 0; fmt[i] != 0; i++) {
                if (fmt[i] == '%') {
                    ss << LOG_COL_BOLD;
                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
                    ss << LOG_COL_DEFAULT;
                    if (fmt[i] == 0) break;
                }
                ss << fmt[i];
            }
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
#endif
        }

        entry.level = level;
        entry.prefix = prefix;
        entry.timestamp = 0;
        if (timestamps) {
            entry.timestamp = t_us() - t_start;
        }
        entry.is_end = false;

        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
            std::vector<gpt_log_entry> new_entries(2*entries.size());

            size_t new_tail = 0;

            do {
                new_entries[new_tail] = std::move(entries[head]);

                head     = (head     + 1) % entries.size();
                new_tail = (new_tail + 1);
            } while (head != tail);

            head = 0;
            tail = new_tail;

            for (size_t i = tail; i < new_entries.size(); i++) {
                new_entries[i].msg.resize(256);
            }

            entries = std::move(new_entries);
        }

        cv.notify_one();
    }

    void resume() {
        std::lock_guard<std::mutex> lock(mtx);

        if (running) {
            return;
        }

        running = true;

        thrd = std::thread([this]() {
            while (true) {
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });

                    cur = entries[head];

                    head = (head + 1) % entries.size();
                }

                if (cur.is_end) {
                    break;
                }

                cur.print(); // stdout and stderr

                if (file) {
                    cur.print(file);
                }
            }
        });
    }

    void pause() {
        {
            std::lock_guard<std::mutex> lock(mtx);

            if (!running) {
                return;
            }

            running = false;

            // push an entry to signal the worker thread to stop
            {
                auto & entry = entries[tail];
                entry.is_end = true;

                tail = (tail + 1) % entries.size();
            }

            cv.notify_one();
        }

        thrd.join();
    }

    void set_file(const char * path) {
        pause();

        if (file) {
            fclose(file);
        }

        if (path) {
            file = fopen(path, "w");
        } else {
            file = nullptr;
        }

        resume();
    }

    void set_colors(bool colors) {
        pause();

        if (colors) {
            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
            }
        }

        resume();
    }

    void set_prefix(bool prefix) {
        std::lock_guard<std::mutex> lock(mtx);

        this->prefix = prefix;
    }

    void set_timestamps(bool timestamps) {
        std::lock_guard<std::mutex> lock(mtx);

        this->timestamps = timestamps;
    }
};

//
// public API
//

struct gpt_log * gpt_log_init() {
    return new gpt_log;
}

struct gpt_log * gpt_log_main() {
    static struct gpt_log log;

    return &log;
}

void gpt_log_pause(struct gpt_log * log) {
    log->pause();
}

void gpt_log_resume(struct gpt_log * log) {
    log->resume();
}

void gpt_log_free(struct gpt_log * log) {
    delete log;
}

void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
}

void gpt_log_set_file(struct gpt_log * log, const char * file) {
    log->set_file(file);
}

void gpt_log_set_colors(struct gpt_log * log, bool colors) {
    log->set_colors(colors);
}

void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
    log->set_prefix(prefix);
}

void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}
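
A usage sketch for the public API defined above (the log file name is hypothetical; every call is defined in this file):

#include "log.h"

int main() {
    struct gpt_log * log = gpt_log_init();

    gpt_log_set_prefix    (log, true);
    gpt_log_set_timestamps(log, true);
    gpt_log_set_file      (log, "server.log"); // mirror messages to a file as well

    gpt_log_add(log, GGML_LOG_LEVEL_INFO, "loaded %d tensors\n", 291);
    gpt_log_add(log, GGML_LOG_LEVEL_WARN, "context size is small: %d\n", 512);

    gpt_log_free(log); // pauses the worker thread and closes the file

    return 0;
}
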
118	llama/log.h	vendored
@@ -1,118 +0,0 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h" // for ggml_log_level

#ifndef __GNUC__
#    define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0

// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity is higher than the threshold
// set via gpt_log_set_verbosity_thold()
extern int gpt_log_verbosity_thold;

void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe

// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;

struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void             gpt_log_free  (struct gpt_log * log);

LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);

// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
// regular log output:
//
//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
//   llm_load_tensors: ggml ctx size =    0.27 MiB
//   llm_load_tensors: offloading 32 repeating layers to GPU
//   llm_load_tensors: offloading non-repeating layers to GPU
//
// with prefix = true, timestamps = true, the log output will look like this:
//
//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info    (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error   (stderr, V = 0)
// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
//

void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
//   LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//

#define LOG_TMPL(level, verbosity, ...) \
    do { \
        if ((verbosity) <= gpt_log_verbosity_thold) { \
            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)

#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)

#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)

#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
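
Putting the macros above together, and showing the argument-skipping behavior the comment describes (a sketch; expensive_function is a placeholder):

#include "log.h"

static int expensive_function() {
    // imagine a costly computation here
    return 42;
}

void report() {
    gpt_log_set_verbosity_thold(LOG_DEFAULT_LLAMA); // 0: debug messages are filtered out

    LOG_INF("offloading %d layers\n", 32);        // printed (verbosity 0 <= threshold)
    LOG_DBG("stats: %d\n", expensive_function()); // skipped entirely: expensive_function() never runs
}
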
@@ -47,8 +47,8 @@ create-patches: $(LLAMACPP_REPO)
	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)

# Vendoring template logic
EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h
EXCLUDED_FILES=sgemm.cpp sgemm.h stb_image.h json.hpp llama_darwin.c base64.hpp
OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c
define vendor_file
$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1))
ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),)
@@ -149,20 +149,7 @@ LAVA_FILES= \
	examples/llava/clip.h \
	examples/llava/llava.cpp \
	examples/llava/llava.h \
	common/log.h \
	common/log.cpp \
	common/stb_image.h
# These files are mostly used by the llava code
# and shouldn't be necessary once we use clip.cpp directly
LAVA_FILES+= \
	common/common.cpp \
	common/common.h \
	common/sampling.cpp \
	common/sampling.h \
	common/json.hpp \
	common/json-schema-to-grammar.cpp \
	common/json-schema-to-grammar.h \
	common/base64.hpp
$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))

$(DST_DIR)build-info.cpp:

@@ -475,7 +475,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
		}

		// sample a token
		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
		token, err := seq.samplingCtx.Sample(s.lc, seq.iBatch)
		if err != nil {
			slog.Error("failed to sample token", "error", err)
			s.removeSequence(i, "error")
			continue
		}
		seq.samplingCtx.Accept(token, true)
		piece := s.model.TokenToPiece(token)

484	llama/sampling.cpp	vendored
@@ -1,484 +0,0 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "sampling.h"

#include "common.h"

#include <cmath>
#include <unordered_map>

// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
template<typename T>
struct ring_buffer {
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[pos];
    }

    void push_back(const T & value) {
        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0;
    size_t sz = 0;
    size_t first = 0;
    size_t pos = 0;
    std::vector<T> data;
};
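
The index arithmetic above is easiest to see on a tiny instance; this sketch traces the struct exactly as written:

#include <cassert>

static void ring_buffer_demo() {
    ring_buffer<int> rb(3);   // capacity 3

    rb.push_back(1);
    rb.push_back(2);
    rb.push_back(3);
    rb.push_back(4);          // full: the oldest element (1) is overwritten

    assert(rb.rat(0) == 4);   // rat(i) counts back from the most recent element
    assert(rb.rat(2) == 2);
    assert(rb.front() == 2);  // oldest surviving element

    assert(rb.pop_front() == 2);
    assert(rb.size() == 2);
}
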
struct gpt_sampler {
 | 
			
		||||
    gpt_sampler_params params;
 | 
			
		||||
 | 
			
		||||
    struct llama_sampler * grmr;
 | 
			
		||||
    struct llama_sampler * chain;
 | 
			
		||||
 | 
			
		||||
    ring_buffer<llama_token> prev;
 | 
			
		||||
 | 
			
		||||
    std::vector<llama_token_data> cur;
 | 
			
		||||
 | 
			
		||||
    llama_token_data_array cur_p;
 | 
			
		||||
 | 
			
		||||
    void set_logits(struct llama_context * ctx, int idx) {
 | 
			
		||||
        const auto * logits = llama_get_logits_ith(ctx, idx);
 | 
			
		||||
 | 
			
		||||
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 | 
			
		||||
 | 
			
		||||
        cur.resize(n_vocab);
 | 
			
		||||
 | 
			
		||||
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
 | 
			
		||||
            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        cur_p = { cur.data(), cur.size(), -1, false };
 | 
			
		||||
    }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
std::string gpt_sampler_params::print() const {
 | 
			
		||||
    char result[1024];
 | 
			
		||||
 | 
			
		||||
    snprintf(result, sizeof(result),
 | 
			
		||||
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
 | 
			
		||||
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
 | 
			
		||||
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
 | 
			
		||||
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
 | 
			
		||||
            top_k, tfs_z, top_p, min_p, typ_p, temp,
 | 
			
		||||
            mirostat, mirostat_eta, mirostat_tau);
 | 
			
		||||
 | 
			
		||||
    return std::string(result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
 | 
			
		||||
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 | 
			
		||||
 | 
			
		||||
    lparams.no_perf = params.no_perf;
 | 
			
		||||
 | 
			
		||||
    auto * result = new gpt_sampler {
 | 
			
		||||
        /* .params = */ params,
 | 
			
		||||
        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
 | 
			
		||||
        /* .chain  = */ llama_sampler_chain_init(lparams),
 | 
			
		||||
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
 | 
			
		||||
        /* .cur    = */ {},
 | 
			
		||||
        /* .cur_p  = */ {},
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    llama_sampler_chain_add(result->chain,
 | 
			
		||||
            llama_sampler_init_logit_bias(
 | 
			
		||||
                llama_n_vocab(model),
 | 
			
		||||
                params.logit_bias.size(),
 | 
			
		||||
                params.logit_bias.data()));
 | 
			
		||||
 | 
			
		||||
    llama_sampler_chain_add(result->chain,
 | 
			
		||||
            llama_sampler_init_penalties(
 | 
			
		||||
                llama_n_vocab  (model),
 | 
			
		||||
                llama_token_eos(model),
 | 
			
		||||
                llama_token_nl (model),
 | 
			
		||||
                params.penalty_last_n,
 | 
			
		||||
                params.penalty_repeat,
 | 
			
		||||
                params.penalty_freq,
 | 
			
		||||
                params.penalty_present,
 | 
			
		||||
                params.penalize_nl,
 | 
			
		||||
                params.ignore_eos));
 | 
			
		||||
 | 
			
		||||
    if (params.temp > 0.0f) {
 | 
			
		||||
        if (params.mirostat == 0) {
 | 
			
		||||
            for (const auto & cnstr : params.samplers) {
 | 
			
		||||
                switch (cnstr) {
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_TOP_K:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
 | 
			
		||||
                        break;
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_TOP_P:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
 | 
			
		||||
                        break;
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_MIN_P:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
 | 
			
		||||
                        break;
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_TFS_Z:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
 | 
			
		||||
                        break;
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_TYPICAL_P:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
 | 
			
		||||
                        break;
 | 
			
		||||
                    case GPT_SAMPLER_TYPE_TEMPERATURE:
 | 
			
		||||
                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
 | 
			
		||||
                        break;
 | 
			
		||||
                    default:
 | 
			
		||||
                        GGML_ASSERT(false && "unknown sampler type");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
 | 
			
		||||
        } else if (params.mirostat == 1) {
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
 | 
			
		||||
        } else if (params.mirostat == 2) {
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
 | 
			
		||||
        } else {
 | 
			
		||||
            GGML_ASSERT(false && "unknown mirostat version");
 | 
			
		||||
        }
 | 
			
		||||
    } else {
 | 
			
		||||
        if (params.n_probs > 0) {
 | 
			
		||||
            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
 | 
			
		||||
            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
 | 
			
		||||
            //
 | 
			
		||||
            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
 | 
			
		||||
            // it is much faster, since we avoid sorting all tokens and should give a good approximation
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
 | 
			
		||||
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
 | 
			
		||||
        }
 | 
			
		||||
        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
 | 
			
		||||
    if (gsmpl) {
 | 
			
		||||
        llama_sampler_free(gsmpl->grmr);
 | 
			
		||||
 | 
			
		||||
        llama_sampler_free(gsmpl->chain);
 | 
			
		||||
 | 
			
		||||
        delete gsmpl;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
 | 
			
		||||
    if (accept_grammar) {
 | 
			
		||||
        llama_sampler_accept(gsmpl->grmr, token);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    llama_sampler_accept(gsmpl->chain, token);
 | 
			
		||||
 | 
			
		||||
    gsmpl->prev.push_back(token);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
 | 
			
		||||
    llama_sampler_reset(gsmpl->grmr);
 | 
			
		||||
 | 
			
		||||
    llama_sampler_reset(gsmpl->chain);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
 | 
			
		||||
    return new gpt_sampler {
 | 
			
		||||
        /* .params = */ gsmpl->params,
 | 
			
		||||
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
 | 
			
		||||
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
 | 
			
		||||
        /* .prev   = */ gsmpl->prev,
 | 
			
		||||
        /* .cur    = */ gsmpl->cur,
 | 
			
		||||
        /* .cur_p  = */ gsmpl->cur_p,
 | 
			
		||||
    };
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
 | 
			
		||||
    // TODO: measure grammar performance
 | 
			
		||||
 | 
			
		||||
    if (gsmpl) {
 | 
			
		||||
        llama_perf_sampler_print(gsmpl->chain);
 | 
			
		||||
    }
 | 
			
		||||
    if (ctx) {
 | 
			
		||||
        llama_perf_context_print(ctx);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
 | 
			
		||||
    gsmpl->set_logits(ctx, idx);
 | 
			
		||||
 | 
			
		||||
    auto & grmr  = gsmpl->grmr;
 | 
			
		||||
    auto & chain = gsmpl->chain;
 | 
			
		||||
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 | 
			
		||||
 | 
			
		||||
    if (grammar_first) {
 | 
			
		||||
        llama_sampler_apply(grmr, &cur_p);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    llama_sampler_apply(chain, &cur_p);
 | 
			
		||||
 | 
			
		||||
    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
 | 
			
		||||
 | 
			
		||||
    const llama_token id = cur_p.data[cur_p.selected].id;
 | 
			
		||||
 | 
			
		||||
    if (grammar_first) {
 | 
			
		||||
        return id;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // check if it the sampled token fits the grammar
    {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
}
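
As a point of reference, a minimal sketch (not part of the diff, and assuming `model`, `ctx`, `params`, and `n_predict` are set up elsewhere) of how the sample/accept pair composes into a generation loop:

    struct gpt_sampler * gsmpl = gpt_sampler_init(model, params);

    for (int i = 0; i < n_predict; i++) {
        // default grammar_first = false: the fast path grammar-checks only the
        // sampled token and falls back to the resampling path above on rejection
        const llama_token id = gpt_sampler_sample(gsmpl, ctx, /* idx */ -1);

        // record the token in the grammar state and the history ring buffer
        gpt_sampler_accept(gsmpl, id, /* accept_grammar */ true);

        // ... append `id` to the batch and llama_decode() for the next step ...
    }

    gpt_perf_print(ctx, gsmpl);
    gpt_sampler_free(gsmpl);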

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
}

// helpers

llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
    return &gsmpl->cur_p;
}

llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
}

std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
    }

    return result;
}

std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
        return "";
    }

    std::string result;
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

        result += llama_token_to_piece(ctx_main, id);
    }

    return result;
}

char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
        default:                           return '?';
    }
}

std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default:                           return "";
    }
}

std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    // sampler names are written in multiple ways, so accept both the
    // canonical spellings and common alternate names
    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
        } else {
            if (allow_alt_names) {
                sampler = sampler_alt_name_map.find(name);
                if (sampler != sampler_alt_name_map.end()) {
                    samplers.push_back(sampler->second);
                }
            }
        }
    }

    return samplers;
}
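
For illustration, a hedged usage sketch (not part of the diff): hyphenated and shorthand spellings only resolve when allow_alt_names is set.

    // "typical" is an alternate name, so it requires allow_alt_names = true
    const auto order = gpt_sampler_types_from_names(
        { "top_k", "typical", "min_p" }, /* allow_alt_names = */ true);
    // order == { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TYPICAL_P, GPT_SAMPLER_TYPE_MIN_P }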

std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
        }
    }

    return samplers;
}
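
Similarly, the single-character form round-trips through gpt_sampler_type_to_chr, and characters with no mapping are silently dropped (a sketch, not from the diff):

    // 'x' has no sampler mapping, so only k/m/t survive
    const auto order = gpt_sampler_types_from_chars("kxmt");
    // order == { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_MIN_P, GPT_SAMPLER_TYPE_TEMPERATURE }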

109 llama/sampling.h vendored
@@ -1,109 +0,0 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "llama.h"

#include "common.h"

#include <string>
#include <vector>

// gpt_sampler extends llama_sampler with additional functionality:
//
//  - grammar support
//  - custom sampler logic based on the parameters
//  - history of the last accepted tokens
//  - performance metrics
//
// The goal is to have a common implementation of the sampling logic shared across the examples.
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// complex (top-k, top-p, etc).
//
// Another example is related to the grammar. In general, the grammar constraints applied on the full
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//

struct gpt_sampler;

// llama_sampler API overloads

struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);

void gpt_sampler_free(struct gpt_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);

// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
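
To make the grammar_first trade-off concrete, a speculative sketch (not part of the diff): with grammar_first = true every surviving candidate is grammar-valid, so the array from gpt_sampler_get_candidates can be inspected directly.

    // slower path: grammar constraints applied to the full vocabulary up front
    const llama_token id = gpt_sampler_sample(gsmpl, ctx, /* idx */ -1, /* grammar_first */ true);

    // every remaining candidate already satisfies the grammar; note the `p`
    // values are only meaningful once a softmax/dist sampler has run in the chain
    llama_token_data_array * cur_p = gpt_sampler_get_candidates(gsmpl);
    for (size_t i = 0; i < cur_p->size; i++) {
        printf("cand %zu: token %d, p = %.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
    }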

56 llama/sampling_ext.cpp vendored
@@ -1,56 +0,0 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#include "sampling.h"
#include "sampling_ext.h"

struct gpt_sampler *gpt_sampler_cinit(
    const struct llama_model *model, struct gpt_sampler_cparams *params)
{
    try {
        gpt_sampler_params sparams;
        sparams.top_k = params->top_k;
        sparams.top_p = params->top_p;
        sparams.min_p = params->min_p;
        sparams.tfs_z = params->tfs_z;
        sparams.typ_p = params->typical_p;
        sparams.temp = params->temp;
        sparams.penalty_last_n = params->penalty_last_n;
        sparams.penalty_repeat = params->penalty_repeat;
        sparams.penalty_freq = params->penalty_freq;
        sparams.penalty_present = params->penalty_present;
        sparams.mirostat = params->mirostat;
        sparams.mirostat_tau = params->mirostat_tau;
        sparams.mirostat_eta = params->mirostat_eta;
        sparams.penalize_nl = params->penalize_nl;
        sparams.seed = params->seed;
        sparams.grammar = params->grammar;
        return gpt_sampler_init(model, sparams);
    } catch (const std::exception & err) {
        return nullptr;
    }
}

void gpt_sampler_cfree(struct gpt_sampler *sampler)
{
    gpt_sampler_free(sampler);
}

void gpt_sampler_creset(struct gpt_sampler *sampler)
{
    gpt_sampler_reset(sampler);
}

llama_token gpt_sampler_csample(
    struct gpt_sampler *sampler,
    struct llama_context *ctx_main,
    int idx)
{
    return gpt_sampler_sample(sampler, ctx_main, idx);
}

void gpt_sampler_caccept(
    struct gpt_sampler *sampler,
    llama_token id,
    bool apply_grammar)
{
    gpt_sampler_accept(sampler, id, apply_grammar);
}

54 llama/sampling_ext.h vendored
@@ -1,54 +0,0 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#ifndef GPT_SAMPLER_EXT_H
#define GPT_SAMPLER_EXT_H

#ifdef __cplusplus
extern "C"
{
#endif

    // Forward declaration to avoid include of "sampling.h" which has C++
    // includes
    struct gpt_sampler;

    struct gpt_sampler_cparams
    {
        int32_t top_k;
        float top_p;
        float min_p;
        float tfs_z;
        float typical_p;
        float temp;
        int32_t penalty_last_n;
        float penalty_repeat;
        float penalty_freq;
        float penalty_present;
        int32_t mirostat;
        float mirostat_tau;
        float mirostat_eta;
        bool penalize_nl;
        uint32_t seed;
        char *grammar;
    };

    struct gpt_sampler *gpt_sampler_cinit(
        const struct llama_model *model,
        struct gpt_sampler_cparams *params);
    void gpt_sampler_cfree(struct gpt_sampler *sampler);
    void gpt_sampler_creset(struct gpt_sampler *sampler);

    llama_token gpt_sampler_csample(
        struct gpt_sampler *sampler,
        struct llama_context *ctx_main,
        int idx);

    void gpt_sampler_caccept(
        struct gpt_sampler *sampler,
        llama_token id,
        bool apply_grammar);

#ifdef __cplusplus
}
#endif

#endif // GPT_SAMPLER_EXT_H
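
For context, a hedged sketch (not part of the diff) of how the C-compatible wrapper above is meant to be driven, e.g. from the C side of a CGo binding; the field values below are illustrative assumptions, not project defaults.

    #include "sampling_ext.h"

    void sample_n(struct llama_model *model, struct llama_context *ctx, int n) {
        struct gpt_sampler_cparams cparams = {0};
        cparams.top_k          = 40;
        cparams.top_p          = 0.9f;
        cparams.min_p          = 0.05f;
        cparams.tfs_z          = 1.0f;       // 1.0 assumed to disable tail-free sampling
        cparams.typical_p      = 1.0f;       // 1.0 assumed to disable typical sampling
        cparams.temp           = 0.8f;
        cparams.penalty_repeat = 1.0f;       // 1.0 assumed to disable the repeat penalty
        cparams.seed           = 42;
        cparams.grammar        = (char *)""; // empty string: no grammar constraint (assumption)

        struct gpt_sampler *smpl = gpt_sampler_cinit(model, &cparams);
        if (smpl == NULL) {
            return; // gpt_sampler_cinit signals failure with a null pointer
        }

        for (int i = 0; i < n; i++) {
            llama_token id = gpt_sampler_csample(smpl, ctx, /* idx */ -1);
            gpt_sampler_caccept(smpl, id, /* apply_grammar */ true);
            // ... decode `id` with llama_decode() to produce the next logits ...
        }

        gpt_sampler_cfree(smpl);
    }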