Your IP : 3.17.61.107


Current Path : /var/www/wptbox/wp-content/plugins/astra-sites/inc/lib/zip-ai/classes/
Upload File :
Current File : /var/www/wptbox/wp-content/plugins/astra-sites/inc/lib/zip-ai/classes/token-calculator.php

<?php
/**
 * Zip AI - Token Calculator
 *
 * @package zip-ai
 */

namespace ZipAI\Classes;

// Stop right here unless ABSPATH is defined, i.e. when this file is requested directly.
defined( 'ABSPATH' ) || exit;

/**
 * The Token_Calculator Class.
 *
 * @since 1.0.0
 */
class Token_Calculator {
	/**
	 * Encode a text into GPT byte-pair-encoding (BPE) tokens.
	 *
	 * The text is cut into pieces with a regular expression, every byte of a piece is
	 * replaced by its stand-in character from characters.json, the stand-in string is
	 * merged with the rules from vocab.bpe, and each merged piece is finally looked up
	 * in encoder.json.
	 *
	 * @param string $text The text to encode.
	 * @since 1.0.0
	 * @return array The encoded tokens, one entry per token. The first occurrence of a piece is
	 *               keyed by the piece itself; repeated pieces are keyed `<random>---<piece>` so
	 *               that they do not overwrite each other. Each value is the id found in
	 *               encoder.json, or the piece itself when the encoder has no entry for it.
	 *               An empty array is returned when the text is empty or an asset cannot be loaded.
	 */
	public static function gpt_encode( $text ) {
		$bpe_tokens = [];

		// Only genuinely empty input is rejected. empty() would also throw away the valid text "0".
		if ( null === $text || is_array( $text ) ) {
			return $bpe_tokens;
		}
		$text = (string) $text;
		if ( '' === $text ) {
			return $bpe_tokens;
		}

		// Cut the text into pieces first: when nothing matches there is no need to load the assets.
		preg_match_all( "#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches );
		if ( ! isset( $matches[0] ) || 0 === count( $matches[0] ) ) {
			return $bpe_tokens;
		}

		// Byte value => stand-in character.
		$byte_encoder = json_decode( self::read_asset( 'characters.json' ), true );
		if ( empty( $byte_encoder ) || ! is_array( $byte_encoder ) ) {
			return $bpe_tokens;
		}

		// Merged piece => token id.
		$encoder = json_decode( self::read_asset( 'encoder.json' ), true );
		if ( empty( $encoder ) || ! is_array( $encoder ) ) {
			return $bpe_tokens;
		}

		// Merge rules, in priority order.
		$bpe_file = self::read_asset( 'vocab.bpe' );
		if ( empty( $bpe_file ) ) {
			return $bpe_tokens;
		}
		$bpe_ranks = self::build_bpe_ranks( $bpe_file );

		$cache = [];
		foreach ( $matches[0] as $token ) {
			// Replace every raw byte of the piece by its stand-in character. Looking the byte
			// value up directly gives the same result as re-encoding the piece with
			// gpt_utf8_encode() and decoding each character with gpt_unichr(), but it does not
			// depend on the mbstring extension.
			$result_word = '';
			$byte_count  = strlen( $token );
			for ( $i = 0; $i < $byte_count; $i++ ) {
				$byte = ord( $token[ $i ] );
				if ( isset( $byte_encoder[ $byte ] ) ) {
					$result_word .= $byte_encoder[ $byte ];
				}
			}

			// Apply the merge rules; the merged pieces come back separated by single spaces.
			$pieces = explode( ' ', self::gpt_bpe( $result_word, $bpe_ranks, $cache ) );

			// Translate the pieces of this token, keeping repeated pieces apart.
			$new_tokens = [];
			foreach ( $pieces as $piece ) {
				$value = isset( $encoder[ $piece ] ) ? $encoder[ $piece ] : $piece;

				$new_tokens[ self::unique_key( $new_tokens, $piece ) ] = $value;
			}

			// Append them to the overall result, again without overwriting earlier entries.
			foreach ( $new_tokens as $key => $value ) {
				$bpe_tokens[ self::unique_key( $bpe_tokens, $key ) ] = $value;
			}
		}

		return $bpe_tokens;
	}

	/**
	 * Read one of the bundled tokenizer asset files.
	 *
	 * @param string $file The file name inside assets/open-ai/json/.
	 * @return string The file contents, or an empty string when the file cannot be read.
	 */
	private static function read_asset( $file ) {
		$path = ZIP_AI_DIR . 'assets/open-ai/json/' . $file;
		if ( ! is_readable( $path ) ) {
			return '';
		}
		$contents = file_get_contents( $path ); // phpcs:ignore WordPress.WP.AlternativeFunctions.file_get_contents_file_get_contents

		return is_string( $contents ) ? $contents : '';
	}

	/**
	 * Turn the contents of the merge-rule file into a rank lookup table.
	 *
	 * @param string $bpe_file The raw contents of vocab.bpe.
	 * @return array Map of "first,second" => rank, see gpt_dict_zip().
	 */
	private static function build_bpe_ranks( $bpe_file ) {
		$lines = preg_split( '/\r\n|\r|\n/', $bpe_file );
		if ( ! is_array( $lines ) ) {
			return [];
		}

		$bpe_merges = [];
		// The first line is skipped (presumably the header line of the file - confirm against the asset).
		foreach ( array_slice( $lines, 1 ) as $line ) {
			$parts = preg_split( '#(\s+)#', $line );
			if ( ! is_array( $parts ) ) {
				continue;
			}
			// Re-index after filtering so a line with leading whitespace still exposes its
			// two parts at offsets 0 and 1, which is where gpt_dict_zip() looks for them.
			$parts = array_values( array_filter( $parts, [ __CLASS__, 'gpt_filter' ] ) );
			if ( count( $parts ) > 0 ) {
				$bpe_merges[] = $parts;
			}
		}

		return self::gpt_dict_zip( $bpe_merges, array_keys( $bpe_merges ) );
	}

	/**
	 * Find a key under which a value can be stored without overwriting an existing entry.
	 *
	 * @param array      $tokens The array that is about to receive the entry.
	 * @param string|int $key    The preferred key.
	 * @return string|int The preferred key when it is still free, otherwise `<random>---<key>`.
	 */
	private static function unique_key( $tokens, $key ) {
		$candidate = $key;
		// Draw again on the (unlikely) chance that a random key is already taken.
		while ( isset( $tokens[ $candidate ] ) ) {
			$candidate = wp_rand() . '---' . $key;
		}

		return $candidate;
	}

	/**
	 * Get the ranks of the BPE merges.
	 *
	 * Entries that do not provide both offset 0 and offset 1 are skipped and do not consume a rank.
	 *
	 * @param array $x The BPE merges, each one an array holding the two parts at offsets 0 and 1.
	 * @param array $y The range. Not used; kept so existing callers keep working.
	 * @since 1.0.0
	 * @return array The ranks, keyed "first,second" and numbered from 0 in the order of $x.
	 */
	public static function gpt_dict_zip( $x, $y ) {
		$result = [];
		$rank   = 0;
		foreach ( $x as $merge ) {
			if ( isset( $merge[0], $merge[1] ) ) {
				$result[ $merge[0] . ',' . $merge[1] ] = $rank;
				$rank++;
			}
		}

		return $result;
	}

	/**
	 * Re-encode a byte string as UTF-8, treating every input byte as a code point of its own.
	 *
	 * Bytes below 0x80 are copied unchanged; every other byte becomes the two-byte UTF-8
	 * sequence of the code point that has the same numeric value as the byte.
	 *
	 * @param string $str The string/token.
	 * @since 1.0.0
	 * @return string The re-encoded string.
	 */
	public static function gpt_utf8_encode( $str ) {
		$str    = (string) $str;
		$result = '';
		$length = strlen( $str );
		for ( $i = 0; $i < $length; $i++ ) {
			$byte = ord( $str[ $i ] );
			if ( $byte < 0x80 ) {
				$result .= $str[ $i ];
			} elseif ( $byte < 0xC0 ) {
				$result .= "\xC2" . $str[ $i ];
			} else {
				$result .= "\xC3" . chr( $byte - 64 );
			}
		}

		return $result;
	}

	/**
	 * Get the code point of the first UTF-8 character of the given string.
	 *
	 * @param string $c The character.
	 * @since 1.0.0
	 * @return int The code point. 0 is returned for an empty string, for a string that starts
	 *             with a byte that cannot open a sequence (0x80-0xBF, 0xFE, 0xFF) and for a
	 *             sequence that is shorter than its first byte announces.
	 */
	public static function gpt_unichr( $c ) {
		$c = (string) $c;
		if ( '' === $c ) {
			return 0;
		}

		$lead = ord( $c[0] );
		if ( $lead <= 127 ) {
			return $lead;
		}
		if ( $lead < 192 || $lead > 253 ) {
			return 0;
		}

		// The lead byte determines how many continuation bytes follow and which bits it contributes.
		if ( $lead <= 223 ) {
			$extra = 1;
			$code  = $lead - 192;
		} elseif ( $lead <= 239 ) {
			$extra = 2;
			$code  = $lead - 224;
		} elseif ( $lead <= 247 ) {
			$extra = 3;
			$code  = $lead - 240;
		} elseif ( $lead <= 251 ) {
			$extra = 4;
			$code  = $lead - 248;
		} else {
			$extra = 5;
			$code  = $lead - 252;
		}

		// Do not read past the end of a truncated sequence.
		if ( strlen( $c ) <= $extra ) {
			return 0;
		}

		// Every continuation byte contributes six more bits.
		for ( $i = 1; $i <= $extra; $i++ ) {
			$code = $code * 64 + ( ord( $c[ $i ] ) - 128 );
		}

		return $code;
	}

	/**
	 * Get the encoded BPE tokens.
	 *
	 * Starting from single characters, the adjacent pair with the lowest rank is merged
	 * over and over until no adjacent pair has a rank any more.
	 *
	 * @param string $token The token.
	 * @param array  $bpe_ranks The BPE ranks, keyed "first,second".
	 * @param array  $cache The cache of earlier results, keyed by token. Passed by reference and
	 *                      extended with the result of this call.
	 * @since 1.0.0
	 * @return string The merged pieces, joined with single spaces. A token with fewer than two
	 *                characters is returned unchanged and is not cached.
	 */
	public static function gpt_bpe( $token, $bpe_ranks, &$cache ) {
		// Check if the token is in the cache.
		if ( array_key_exists( $token, $cache ) ) {
			return $cache[ $token ];
		}

		// Split the token into UTF-8 characters.
		$word     = self::gpt_split( $token );
		$init_len = count( $word );
		$pairs    = self::gpt_get_pairs( $word );
		// If there are no pairs, return the token.
		if ( ! $pairs ) {
			return $token;
		}

		while ( true ) {
			// Pick the adjacent pair with the lowest rank; pairs without a rank are never merged.
			$bigram    = null;
			$best_rank = null;
			foreach ( $pairs as $pair ) {
				$rank_key = $pair[0] . ',' . $pair[1];
				if ( ! array_key_exists( $rank_key, $bpe_ranks ) ) {
					continue;
				}
				if ( null === $bigram || $bpe_ranks[ $rank_key ] < $best_rank ) {
					$bigram    = $pair;
					$best_rank = $bpe_ranks[ $rank_key ];
				}
			}
			// No pair can be merged any more.
			if ( null === $bigram ) {
				break;
			}

			$first       = $bigram[0];
			$second      = $bigram[1];
			$new_word    = [];
			$i           = 0;
			$word_length = count( $word );
			// Walk through the word and merge every occurrence of the chosen pair.
			while ( $i < $word_length ) {
				$j = self::gpt_index_of( $word, $first, $i );
				// No further occurrence of the first part: copy the remainder unchanged.
				if ( -1 === $j ) {
					$new_word = array_merge( $new_word, array_slice( $word, $i ) );
					break;
				}
				// Copy everything between the cursor and the occurrence that was found.
				if ( $j > $i ) {
					$new_word = array_merge( $new_word, array_slice( $word, $i, $j - $i ) );
				}
				// Defensive stop: the rebuilt word must never outgrow the initial split.
				if ( count( $new_word ) > $init_len ) {
					break;
				}
				$i = $j;
				// Merge when the second part follows directly, otherwise keep the piece as it is.
				if ( $word[ $i ] === $first && $i < $word_length - 1 && $word[ $i + 1 ] === $second ) {
					$new_word[] = $first . $second;
					$i         += 2;
				} else {
					$new_word[] = $word[ $i ];
					++$i;
				}
			}

			// Nothing changed, so another round would not change anything either.
			if ( $word === $new_word ) {
				break;
			}
			$word = $new_word;
			// A single piece has no pairs left.
			if ( 1 === count( $word ) ) {
				break;
			}
			$pairs = self::gpt_get_pairs( $word );
		}

		$word            = implode( ' ', $word );
		$cache[ $token ] = $word;

		// Return the word.
		return $word;
	}

	/**
	 * Split a string into chunks of UTF-8 characters.
	 *
	 * @param string $str The string.
	 * @param int    $len The number of characters per chunk - default 1. Values below 1 are treated as 1.
	 * @since 1.0.0
	 * @return array The chunks, in order. An empty string gives an empty array.
	 */
	public static function gpt_split( $str, $len = 1 ) {
		$str = (string) $str;
		// A chunk size below 1 would never advance the cursor and loop forever, so clamp it.
		$len = max( 1, (int) $len );
		if ( '' === $str ) {
			return [];
		}

		if ( function_exists( 'mb_strlen' ) && function_exists( 'mb_substr' ) ) {
			$arr    = [];
			$length = mb_strlen( $str, 'UTF-8' );
			for ( $i = 0; $i < $length; $i += $len ) {
				$arr[] = mb_substr( $str, $i, $len, 'UTF-8' );
			}

			return $arr;
		}

		// Without mbstring, let PCRE separate the characters so multibyte characters stay intact.
		$chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );
		if ( ! is_array( $chars ) ) {
			// The string is not valid UTF-8: fall back to single bytes.
			$chars = str_split( $str );
		}

		return array_map(
			function ( $chunk ) {
				return implode( '', $chunk );
			},
			array_chunk( $chars, $len )
		);
	}

	/**
	 * Get the pairs of a word.
	 *
	 * @param array $word The word, as a list of pieces.
	 * @since 1.0.0
	 * @return array The pairs of neighbouring pieces, each one as [ left, right ]. A word with
	 *               fewer than two pieces has no pairs.
	 */
	public static function gpt_get_pairs( $word ) {
		$pairs       = [];
		$word_length = count( $word );
		// Starting at 1 means an empty or single-piece word is never indexed.
		for ( $i = 1; $i < $word_length; $i++ ) {
			$pairs[] = [ $word[ $i - 1 ], $word[ $i ] ];
		}

		return $pairs;
	}

	/**
	 * Get the index of a value in an array.
	 *
	 * @param array  $arrax The array.
	 * @param string $search_element The value to search for (compared strictly).
	 * @param int    $from_index The index to start searching from.
	 * @since 1.0.0
	 * @return int The index of the first match at or after $from_index, or -1 when there is none.
	 */
	public static function gpt_index_of( $arrax, $search_element, $from_index ) {
		foreach ( $arrax as $index => $value ) {
			if ( $index >= $from_index && $value === $search_element ) {
				return $index;
			}
		}

		return -1;
	}

	/**
	 * Filter a variable.
	 *
	 * @param mixed $var The variable.
	 * @since 1.0.0
	 * @return bool Whether the variable is not null, false, or empty.
	 */
	public static function gpt_filter( $var ) {
		return ! in_array( $var, [ null, false, '' ], true );
	}
}