Кодирование и декодирование Unicode-строк с помощью SCSU

Опубликовано: 05.09.2018

Этот PHP-класс кодирует и декодирует строки Unicode с помощью SCSU (Standard Compression Scheme for Unicode — стандартная схема сжатия Юникода). Он принимает текстовую строку в кодировке UTF-8 и кодирует её с помощью SCSU, чтобы улучшить последующее сжатие алгоритмом LZF. Класс также выполняет обратную операцию, то есть декодирует текстовую строку, ранее закодированную с использованием SCSU.

Стандарт Unicode определяет несколько способов кодирования текста, поэтому одна и та же текстовая строка может быть закодирована по-разному. Этот класс кодирует текстовую строку способом, наиболее эффективным для уменьшения её размера алгоритмами сжатия.

Лицензия GPL.

Исходник класса SCSU

Скачать полный архив со скриптом.

require_once 'UTF8.php';

/**
 * Encoder/decoder for SCSU (Standard Compression Scheme for Unicode).
 *
 * compress() turns a UTF-8 string into an SCSU byte string; decompress()
 * does the reverse. Conversion between UTF-8 bytes and numeric code values
 * is delegated to the external UTF8 helper class (UTF8::strToCodepoints()
 * and UTF8::codepointToStr()), which is not part of this file.
 *
 * NOTE(review): the surrogate handling below suggests the UTF8 helper works
 * with UTF-16 code units (surrogate pairs) rather than full code points;
 * confirm against UTF8.php.
 *
 * The code intentionally sticks to PHP 5.3-compatible syntax.
 */
class SCSU
{
    // ---- single-byte mode tag bytes ------------------------------------

    // SQn: quote the following byte from window n (static window when the
    // byte is < 0x80, dynamic window otherwise).
    const SQ0 = 0x01;
    const SQ1 = 0x02;
    const SQ2 = 0x03;
    const SQ3 = 0x04;
    const SQ4 = 0x05;
    const SQ5 = 0x06;
    const SQ6 = 0x07;
    const SQ7 = 0x08;

    // SDX: define an extended (supplementary plane) window; two bytes follow.
    const SDX = 0x0B;
    // Srs: not allowed in input; decompress() throws when it meets this byte.
    const Srs = 0x0C;
    // SQU: quote one 16-bit unit; two bytes (high, low) follow.
    const SQU = 0x0E;
    // SCU: switch to Unicode (two bytes per unit) mode.
    const SCU = 0x0F;

    // SCn: select dynamic window n.
    const SC0 = 0x10;
    const SC1 = 0x11;
    const SC2 = 0x12;
    const SC3 = 0x13;
    const SC4 = 0x14;
    const SC5 = 0x15;
    const SC6 = 0x16;
    const SC7 = 0x17;

    // SDn: define dynamic window n; one offset byte follows.
    const SD0 = 0x18;
    const SD1 = 0x19;
    const SD2 = 0x1A;
    const SD3 = 0x1B;
    const SD4 = 0x1C;
    const SD5 = 0x1D;
    const SD6 = 0x1E;
    const SD7 = 0x1F;

    // ---- Unicode mode tag bytes ----------------------------------------

    // UCn: select dynamic window n and return to single-byte mode.
    const UC0 = 0xE0;
    const UC1 = 0xE1;
    const UC2 = 0xE2;
    const UC3 = 0xE3;
    const UC4 = 0xE4;
    const UC5 = 0xE5;
    const UC6 = 0xE6;
    const UC7 = 0xE7;

    // UDn: define dynamic window n and return to single-byte mode.
    const UD0 = 0xE8;
    const UD1 = 0xE9;
    const UD2 = 0xEA;
    const UD3 = 0xEB;
    const UD4 = 0xEC;
    const UD5 = 0xED;
    const UD6 = 0xEE;
    const UD7 = 0xEF;

    // UQU: quote one 16-bit unit whose high byte would look like a tag.
    const UQU = 0xF0;
    // UDX: define an extended window and return to single-byte mode.
    const UDX = 0xF1;
    // Urs: declared for completeness; not referenced by this class.
    const Urs = 0xF2;

    // ---- window offset byte layout (see defineWindow()) ----------------

    // Offset bytes below this value map to (byte << 7).
    const gapThreshold = 0x68;
    // Added to (byte << 7) for offset bytes in [gapThreshold, reservedStart).
    const gapOffset = 0xAC00;
    // Offset bytes in [reservedStart, fixedThreshold) are rejected.
    const reservedStart = 0xA8;
    // Offset bytes from this value on index into $fixedOffset.
    const fixedThreshold = 0xF9;

    // Maximum number of values requested from UTF8::strToCodepoints() per chunk.
    const cpMaxNum = 10000;

    // ---- state ---------------------------------------------------------

    /** @var int[] Start offsets of the eight dynamic windows. */
    private $dynamicOffset;
    /** @var int Index of the currently selected dynamic window. */
    private $selectedWindow;
    /** @var int Read position inside $aIn. */
    private $iIn = 0;
    /** @var int Number of entries in $aIn. */
    private $iInLen = 0;
    /** @var int Position in $sOut of a pending SCU tag, or -1 when none is pending. */
    private $iSCU = -1;
    /** @var bool True while the compressor is emitting Unicode mode. */
    private $fUnicodeMode = false;
    /** @var int Counter used to pick the next dynamic window to redefine (mod 8). */
    private $iNextWindow = 3;
    /** @var string Compressed output accumulated by compress(). */
    private $sOut = '';
    /** @var int[] Current chunk of input values being compressed. */
    private $aIn = array();
    /** @var int Number of bytes the current encoding step wants to emit. */
    private $iOutLen = 0;

    /** @var int[] Start offsets of the eight static windows. */
    private static $staticOffset = array(
        0x0000, 0x0080, 0x0100, 0x0300, 0x2000, 0x2080, 0x2100, 0x3000
    );

    /** @var int[] Initial start offsets of the eight dynamic windows. */
    private static $initialDynamicOffset = array(
        0x0080, 0x00C0, 0x0400, 0x0600, 0x0900, 0x3040, 0x30A0, 0xFF00
    );

    /** @var int[] Predefined window offsets addressed by offset bytes >= fixedThreshold. */
    private static $fixedOffset = array(
        0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
    );

    // ---- public API ----------------------------------------------------

    /**
     * Compresses a UTF-8 string with SCSU.
     *
     * The input is converted in chunks of at most self::cpMaxNum values via
     * UTF8::strToCodepoints() and each chunk is fed to compress_part().
     *
     * @param  string &$sIn Reference to the UTF-8 string (not modified here).
     * @return string SCSU-compressed byte string.
     * @throws SCSU_Exception
     */
    public function compress(&$sIn)
    {
        $this->reset();
        $this->fUnicodeMode = false;
        $this->iSCU = -1;
        $this->iNextWindow = 3;
        $this->sOut = '';

        $inLen = strlen($sIn);
        $inNext = 0;
        do {
            $this->aIn = array();
            // Presumably returns the byte position at which to resume;
            // the loop stops once it reaches the end of the input.
            $inNext = UTF8::strToCodepoints($sIn, $this->aIn, $inNext, self::cpMaxNum);
            $this->compress_part();
        } while ($inNext < $inLen);

        // Release the chunk buffer while keeping the declared property.
        $this->aIn = array();

        return $this->sOut;
    }

    /**
     * Decompresses an SCSU byte string into UTF-8.
     *
     * Truncated input (a tag whose argument bytes are missing) is reported
     * with SCSU_Exception::INPUT_ENDED.
     *
     * @param  string &$sIn Reference to the SCSU-compressed string (not modified here).
     * @return string UTF-8 string.
     * @throws SCSU_Exception
     */
    public function decompress(&$sIn)
    {
        $this->reset();
        $iInLen = strlen($sIn);
        $sOut = '';

        for ($iCur = 0; $iCur < $iInLen; $iCur++) {
            $iStaticWindow = 0;
            $iDynamicWindow = $this->selectedWindow;

            switch (ord($sIn[$iCur])) {
                case self::SQ0:
                case self::SQ1:
                case self::SQ2:
                case self::SQ3:
                case self::SQ4:
                case self::SQ5:
                case self::SQ6:
                case self::SQ7:
                    // The quoted byte must follow the tag.
                    if ($iCur >= $iInLen - 1) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $iDynamicWindow = $iStaticWindow = ord($sIn[$iCur]) - self::SQ0;
                    $iCur++;
                    // Intentional fall-through: decode the quoted byte below.

                default:
                    if (ord($sIn[$iCur]) < 128) {
                        // Low half: static window (window 0 unless quoted).
                        $ch = ord($sIn[$iCur]) + self::$staticOffset[$iStaticWindow];
                        $sOut .= UTF8::codepointToStr($ch);
                    } else {
                        // High half: offset into the active dynamic window.
                        $ch = ord($sIn[$iCur]);
                        $ch -= 0x80;
                        $ch += $this->dynamicOffset[$iDynamicWindow];
                        if ($ch < 0x10000) {
                            $sOut .= UTF8::codepointToStr($ch);
                        } else {
                            // Supplementary value: emit as a surrogate pair.
                            // The low surrogate takes exactly the low 10 bits.
                            $ch -= 0x10000;
                            $sOut .= UTF8::codepointToStr(0xD800 + ($ch >> 10));
                            $sOut .= UTF8::codepointToStr(0xDC00 + ($ch & 0x3FF));
                        }
                    }
                    break;

                case self::SDX:
                    $iCur += 2;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $this->defineExtendedWindow(
                        $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]))
                    );
                    break;

                case self::SD0:
                case self::SD1:
                case self::SD2:
                case self::SD3:
                case self::SD4:
                case self::SD5:
                case self::SD6:
                case self::SD7:
                    $iCur++;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $this->defineWindow(ord($sIn[$iCur - 1]) - self::SD0, ord($sIn[$iCur]));
                    break;

                case self::SC0:
                case self::SC1:
                case self::SC2:
                case self::SC3:
                case self::SC4:
                case self::SC5:
                case self::SC6:
                case self::SC7:
                    $this->selectedWindow = ord($sIn[$iCur]) - self::SC0;
                    break;

                case self::SCU:
                    // Unicode mode: consume two bytes per unit until a tag
                    // switches back to single-byte mode. "break 2" leaves both
                    // this inner loop and the switch; the outer loop's $iCur++
                    // then steps past the last consumed byte.
                    $iCur++;
                    for (; $iCur < $iInLen - 1; $iCur += 2) {
                        $b = ord($sIn[$iCur]);
                        if ($b >= self::UC0 && $b <= self::UC7) {
                            $this->selectedWindow = $b - self::UC0;
                            break 2;
                        } else if ($b >= self::UD0 && $b <= self::UD7) {
                            $this->defineWindow($b - self::UD0, ord($sIn[$iCur + 1]));
                            $iCur++;
                            break 2;
                        } else if ($b == self::UDX) {
                            // Two argument bytes must follow the tag.
                            if ($iCur + 2 >= $iInLen) {
                                throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                            }
                            $this->defineExtendedWindow(
                                $this->charFromTwoBytes(ord($sIn[$iCur + 1]), ord($sIn[$iCur + 2]))
                            );
                            $iCur += 2;
                            break 2;
                        } else if ($b == self::UQU) {
                            // Skip the quote tag; the two quoted bytes must follow.
                            $iCur++;
                            if ($iCur >= $iInLen - 1) {
                                throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                            }
                        }
                        $ch = $this->charFromTwoBytes(ord($sIn[$iCur]), ord($sIn[$iCur + 1]));
                        $sOut .= UTF8::codepointToStr($ch);
                    }
                    // Reaching here means the loop ran out of input; a single
                    // dangling byte is an error.
                    if ($iCur != $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    break;

                case self::SQU:
                    $iCur += 2;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $ch = $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]));
                    $sOut .= UTF8::codepointToStr($ch);
                    break;

                case self::Srs:
                    throw new SCSU_Exception('Pos. ' . $iCur . '.', SCSU_Exception::INPUT_SRS);
            }
        }

        return $sOut;
    }

    // ---- shared private helpers ----------------------------------------

    /**
     * Tells whether a value lies in a range that dynamic windows can cover
     * (below 0x3400 or from 0xE000 upwards).
     *
     * @param  int $ch
     * @return bool
     */
    private function isCompressible($ch)
    {
        return ($ch < 0x3400 || $ch >= 0xE000);
    }

    /**
     * Restores the initial window state: window 0 selected and all eight
     * dynamic windows at their initial offsets.
     */
    private function reset()
    {
        $this->selectedWindow = 0;
        // Arrays are copied by value, so the static table stays untouched.
        $this->dynamicOffset = self::$initialDynamicOffset;
    }

    /**
     * Combines a high and a low byte into a 16-bit value. Negative inputs
     * are treated as signed bytes and shifted into the 0..255 range first.
     *
     * @param  int $hi High byte.
     * @param  int $lo Low byte.
     * @return int
     */
    private function charFromTwoBytes($hi, $lo)
    {
        $ch = ($lo >= 0 ? $lo : 256 + $lo);
        return ($ch + (($hi >= 0 ? $hi : 256 + $hi) << 8));
    }

    // ---- compression ---------------------------------------------------

    /**
     * Compresses the chunk currently held in $this->aIn, appending to
     * $this->sOut. Mode and window state carry over between chunks.
     *
     * @throws SCSU_Exception
     */
    private function compress_part()
    {
        $this->iInLen = count($this->aIn);
        $this->iIn = 0;
        $ch = 0;

        while ($this->iIn < $this->iInLen) {
            if ($this->iSCU != -1) {
                // An SCU tag was just written: emit the Unicode run after it.
                $ch = $this->outputUnicodeRun();
                if (strlen($this->sOut) - $this->iSCU == 3) {
                    // Only one unit followed the tag: turn SCU into SQU so
                    // the encoder stays in single-byte mode.
                    $this->sOut[$this->iSCU] = chr(self::SQU);
                    $this->iSCU = -1;
                    continue;
                } else {
                    $this->iSCU = -1;
                    $this->fUnicodeMode = true;
                }
            } else {
                $ch = $this->outputSingleByteRun();
            }

            if ($this->iIn == $this->iInLen) {
                break;
            }

            // When the run stopped on a value < 0x80, look ahead for the
            // first value >= 0x80 to choose a window for; fall back to the
            // current value when the look-ahead hits the end of the chunk or
            // a value no window can cover.
            for ($ich = $this->iIn; $ch < 0x80; $ich++) {
                if ($ich == $this->iInLen || !$this->isCompressible($this->aIn[$ich])) {
                    $ch = $this->aIn[$this->iIn];
                    break;
                }
                $ch = $this->aIn[$ich];
            }

            $iprevWindow = $this->selectedWindow;

            if ($ch < 0x80 || $this->locateWindow($ch, $this->dynamicOffset)) {
                // An existing dynamic window covers $ch. If the next value
                // belongs to the previous window again, quoting is cheaper
                // than switching windows back and forth.
                if (!$this->fUnicodeMode && $this->iIn < $this->iInLen - 1) {
                    $ch2 = $this->aIn[$this->iIn + 1];
                    if ($ch2 >= $this->dynamicOffset[$iprevWindow]
                        && $ch2 < $this->dynamicOffset[$iprevWindow] + 0x80
                    ) {
                        $this->quoteSingleByte($ch);
                        $this->selectedWindow = $iprevWindow;
                        continue;
                    }
                }
                $this->sOut .= chr(($this->fUnicodeMode ? self::UC0 : self::SC0) + $this->selectedWindow);
                $this->fUnicodeMode = false;
            } else if (!$this->fUnicodeMode && $this->locateWindow($ch, self::$staticOffset)) {
                // Covered by a static window: quote it, keep the old window.
                $this->quoteSingleByte($ch);
                $this->selectedWindow = $iprevWindow;
                continue;
            } else if ($this->positionWindow($ch)) {
                // A dynamic window was redefined to cover $ch.
                $this->fUnicodeMode = false;
            } else {
                // No window can cover $ch: switch to Unicode mode and
                // remember where the tag sits in case it becomes an SQU.
                $this->iSCU = strlen($this->sOut);
                $this->sOut .= chr(self::SCU);
                continue;
            }
        }
    }

    /**
     * Looks for a window in $offsetTable that covers $ch. The currently
     * selected window is tried first; on a match elsewhere,
     * $this->selectedWindow is updated to the matching index.
     *
     * @param  int   $ch
     * @param  int[] &$offsetTable Window start offsets (not modified here).
     * @return bool  True when a covering window exists.
     */
    private function locateWindow($ch, &$offsetTable)
    {
        $iWin = $this->selectedWindow;
        if ($iWin != -1 && $ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
            return true;
        }

        $offsetTableLen = count($offsetTable);
        for ($iWin = 0; $iWin < $offsetTableLen; $iWin++) {
            if ($ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
                $this->selectedWindow = $iWin;
                return true;
            }
        }

        return false;
    }

    /**
     * True for values 0x20..0x7F and for TAB, LF and CR.
     *
     * @param  int $ch
     * @return bool
     */
    private function isAsciiCrLfOrTab($ch)
    {
        return ($ch >= 0x20 && $ch <= 0x7F) || $ch == 0x09 || $ch == 0x0A || $ch == 0x0D;
    }

    /**
     * Emits input values in single-byte mode for as long as each one can be
     * written directly, as an SQ0-quoted control byte, or through the
     * dynamic window that was selected when the method was entered.
     *
     * @return int The first value that could not be encoded (a surrogate
     *             pair is returned combined into one supplementary value),
     *             or 0 when the whole chunk was consumed.
     * @throws SCSU_Exception On unpaired surrogates or a high surrogate at
     *                        the end of the chunk.
     */
    private function outputSingleByteRun()
    {
        $iWin = $this->selectedWindow;

        while ($this->iIn < $this->iInLen) {
            $this->iOutLen = 0;
            $byte1 = 0;
            $byte2 = 0;
            $ch = $this->aIn[$this->iIn];
            $inlen = 1;

            if (($ch & 0xF800) == 0xD800) {
                if (($ch & 0xFC00) == 0xDC00) {
                    throw new SCSU_Exception('Byte #' . $this->iIn . '.', SCSU_Exception::INPUT_UNP_LOW);
                } else {
                    if ($this->iIn >= $this->iInLen - 1) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $ch2 = $this->aIn[$this->iIn + 1];
                    if (($ch2 & 0xFC00) != 0xDC00) {
                        throw new SCSU_Exception('Byte #' . ($this->iIn + 1) . '.', SCSU_Exception::INPUT_UNP_HIGH);
                    }
                    // Combine the surrogate pair into one supplementary value.
                    $ch = ((($ch - 0xD800) << 10) | ($ch2 - 0xDC00)) + 0x10000;
                    $inlen = 2;
                }
            }

            if ($this->isAsciiCrLfOrTab($ch) || $ch == 0) {
                // Passes through unchanged.
                $byte2 = $ch & 0x7F;
                $this->iOutLen = 1;
            } else if ($ch < 0x20) {
                // Other control values collide with tag bytes: quote them.
                $byte1 = self::SQ0;
                $byte2 = $ch & 255;
                $this->iOutLen = 2;
            } else if ($ch >= $this->dynamicOffset[$iWin] && $ch < $this->dynamicOffset[$iWin] + 0x80) {
                $ch -= $this->dynamicOffset[$iWin];
                $byte2 = $ch | 0x80;
                $this->iOutLen = 1;
            }

            if ($this->iOutLen == 0) {
                // Not encodable here; let the caller pick a strategy.
                return $ch;
            }
            if ($this->iOutLen == 2) {
                $this->sOut .= chr($byte1);
            }
            $this->sOut .= chr($byte2);

            $this->iIn += $inlen;
        }

        return 0;
    }

    /**
     * Emits $ch as a quoted byte from the currently selected window
     * (SQn tag followed by one byte) and advances the input by one entry.
     *
     * @param  int $ch
     * @throws SCSU_Exception When neither the dynamic nor the static window
     *                        with the selected index covers $ch.
     */
    private function quoteSingleByte($ch)
    {
        $iWin = $this->selectedWindow;
        $this->sOut .= chr((self::SQ0 + $iWin) & 255);

        if ($ch >= $this->dynamicOffset[$iWin] && $ch < $this->dynamicOffset[$iWin] + 0x80) {
            $ch -= $this->dynamicOffset[$iWin];
            $this->sOut .= chr($ch | 0x80);
        } else if ($ch >= self::$staticOffset[$iWin] && $ch < self::$staticOffset[$iWin] + 0x80) {
            $ch -= self::$staticOffset[$iWin];
            $this->sOut .= chr($ch & 255);
        } else {
            throw new SCSU_Exception('ch = ' . $ch . ' not valid in quoteSingleByte.');
        }

        $this->iIn++;
    }

    /**
     * Emits input values as two-byte units (Unicode mode) until two
     * consecutive compressible values are found or the chunk ends.
     *
     * @return int The last value examined (0 when nothing was left to read).
     */
    private function outputUnicodeRun()
    {
        $ch = 0;

        while ($this->iIn < $this->iInLen) {
            $ch = $this->aIn[$this->iIn];
            $this->iOutLen = 2;

            if ($this->isCompressible($ch)) {
                if ($this->iIn < $this->iInLen - 1) {
                    $ch2 = $this->aIn[$this->iIn + 1];
                    if ($this->isCompressible($ch2)) {
                        // Two compressible values in a row: leave Unicode mode.
                        break;
                    }
                }
                // High bytes 0xE0..0xF2 would be read as Unicode-mode tags,
                // so such values need a UQU prefix.
                if ($ch >= 0xE000 && $ch <= 0xF2FF) {
                    $this->iOutLen = 3;
                }
            }

            if ($this->iOutLen == 3) {
                $this->sOut .= chr(self::UQU);
            }
            $this->sOut .= chr($ch >> 8);
            $this->sOut .= chr($ch & 0xFF);
            $this->iIn++;
        }

        return $ch;
    }

    /**
     * Redefines the next dynamic window (round-robin via $iNextWindow) so
     * that it covers $ch, writes the matching define-window command and
     * selects that window.
     *
     * @param  int $ch Value >= 0x80.
     * @return bool False when $ch lies in 0x3400..0xDFFF, where no window
     *              can be placed; true otherwise.
     * @throws SCSU_Exception When $ch < 0x80.
     */
    private function positionWindow($ch)
    {
        $iWin = $this->iNextWindow % 8;
        $iPosition = 0;
        $fExtended = false;

        if ($ch < 0x80) {
            throw new SCSU_Exception('ch < 0x80.');
        }

        $fixedCount = count(self::$fixedOffset);
        for ($i = 0; $i < $fixedCount; $i++) {
            if ($ch >= self::$fixedOffset[$i] && $ch < self::$fixedOffset[$i] + 0x80) {
                $iPosition = $i;
                break;
            }
        }

        if ($iPosition != 0) {
            // Predefined window. Index 0 is indistinguishable from "not
            // found" here and is handled by the generic branch below.
            $this->dynamicOffset[$iWin] = self::$fixedOffset[$iPosition];
            $iPosition += self::fixedThreshold;
        } else if ($ch < 0x3400) {
            $iPosition = $ch >> 7;
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else if ($ch < 0xE000) {
            return false;
        } else if ($ch <= 0xFFFF) {
            $iPosition = (($ch - self::gapOffset) >> 7);
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else {
            // Supplementary plane: window index goes into the top 3 bits,
            // the offset into the low 13 bits of a 16-bit argument.
            $iPosition = ($ch - 0x10000) >> 7;
            $iPosition |= $iWin << 13;
            $this->dynamicOffset[$iWin] = $ch & 0x1FFF80;
            $fExtended = true;
        }

        if ($fExtended) {
            // Always use the extended form, even when the packed argument
            // happens to be < 0x100 (window 0 with a small offset); the
            // short form would be decoded as a different, wrong offset.
            $this->sOut .= chr($this->fUnicodeMode ? self::UDX : self::SDX);
            $this->sOut .= chr(($iPosition >> 8) & 0xFF);
            $this->sOut .= chr($iPosition & 0xFF);
        } else {
            $this->sOut .= chr(($this->fUnicodeMode ? self::UD0 : self::SD0) + $iWin);
            $this->sOut .= chr($iPosition & 0xFF);
        }

        $this->selectedWindow = $iWin;
        $this->iNextWindow++;

        return true;
    }

    // ---- decompression -------------------------------------------------

    /**
     * Applies an SDn/UDn command: positions dynamic window $iWindow
     * according to the offset byte and selects it.
     *
     * @param  int $iWindow Window index 0..7.
     * @param  int $bOffset Offset byte (negative values are treated as signed bytes).
     * @throws SCSU_Exception For a zero offset byte or one in the rejected
     *                        range [reservedStart, fixedThreshold).
     */
    private function defineWindow($iWindow, $bOffset)
    {
        $iOffset = ($bOffset < 0 ? $bOffset + 256 : $bOffset);

        if ($iOffset == 0) {
            throw new SCSU_Exception('', SCSU_Exception::INPUT_OFF_ZERO);
        } else if ($iOffset < self::gapThreshold) {
            $this->dynamicOffset[$iWindow] = $iOffset << 7;
        } else if ($iOffset < self::reservedStart) {
            $this->dynamicOffset[$iWindow] = ($iOffset << 7) + self::gapOffset;
        } else if ($iOffset < self::fixedThreshold) {
            throw new SCSU_Exception('Value = ' . $iOffset . '.', SCSU_Exception::INPUT_OFF_BAD);
        } else {
            $this->dynamicOffset[$iWindow] = self::$fixedOffset[$iOffset - self::fixedThreshold];
        }

        $this->selectedWindow = $iWindow;
    }

    /**
     * Applies an SDX/UDX command: the top 3 bits of $chOffset pick the
     * window, the low 13 bits (times 0x80, plus 0x10000) give its start.
     * The window is selected afterwards.
     *
     * @param int $chOffset 16-bit command argument.
     */
    private function defineExtendedWindow($chOffset)
    {
        $iWindow = $chOffset >> 13;
        $this->dynamicOffset[$iWindow] = (($chOffset & 0x1FFF) << 7) + (1 << 16);
        $this->selectedWindow = $iWindow;
    }
}

/**
 * Provides exceptions of SCSU errors
 *
 * Error codes are grouped by class in the high nibble (INTERNAL, INPUT,
 * OUTPUT); the constructor prefixes the message with the text for the
 * class and, when different, the text for the specific code.
 *
 * @author Alexey A.Znaev <http://xbsoft.org>
 * @link http://xbsoft.org
 * @package SCSU
 * @version 1.0
 * @since 1.0
 */
class SCSU_Exception extends Exception
{
    const INTERNAL = 0x00;
    const INPUT = 0x10;
    const INPUT_ENDED = 0x11;
    const INPUT_UNP_LOW = 0x12;
    const INPUT_UNP_HIGH = 0x13;
    const INPUT_OFF_ZERO = 0x14;
    const INPUT_OFF_BAD = 0x15;
    const INPUT_SRS = 0x16;
    const OUTPUT = 0x20;

    /** @var string[] Message text per error code. */
    private static $Messages = array(
        self::INTERNAL => 'Internal error.',
        self::INPUT => 'Illegal input.',
        self::INPUT_ENDED => 'Ended prematurely.',
        self::INPUT_UNP_LOW => 'Unpaired low surrogate.',
        self::INPUT_UNP_HIGH => 'Unpaired high surrogate.',
        self::INPUT_OFF_ZERO => 'Zero offset.',
        self::INPUT_OFF_BAD => 'Bad offset.',
        self::INPUT_SRS => 'Srs byte found.',
        self::OUTPUT => 'Bad output.',
    );

    /**
     * Builds the final message as
     * "SCSU: <class text> <code text> <message>".
     *
     * @param string         $message  Extra detail appended after the standard text.
     * @param int            $code     One of the class constants.
     * @param Exception|null $previous Previous exception; passed to the parent
     *                                 constructor, which validates its type.
     */
    public function __construct($message = '', $code = 0x00, $previous = null)
    {
        $message_prefix = '';
        $code_class = $code & 0xF0;

        if (array_key_exists($code_class, self::$Messages)) {
            $message_prefix = self::$Messages[$code_class];
        }
        if (($code != $code_class) && array_key_exists($code, self::$Messages)) {
            $message_prefix .= ' ' . self::$Messages[$code];
        }
        if (!empty($message_prefix)) {
            $message = $message_prefix . ' ' . $message;
        }

        parent::__construct('SCSU: ' . $message, $code, $previous);
    }
}

Системные требования скрипта:

PHP версии не ниже 5.3.

Скачать архивы

Алексей Знаев

2507

Комментировать

rss