diff --git a/lib/Unishox-1.0-shadinger/python/unishox.py b/lib/Unishox-1.0-shadinger/python/unishox.py index bc8fc97e6..feafe0500 100644 --- a/lib/Unishox-1.0-shadinger/python/unishox.py +++ b/lib/Unishox-1.0-shadinger/python/unishox.py @@ -1,496 +1,513 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Python class for compressing short strings. + +This class contains a highly modified and optimized version of Unishox +for Tasmota, written in C and ported to Python 3. + +Unishox was developed to individually compress and decompress small strings +(see https://github.com/siara-cc/Unishox). +In general, compression utilities such as zip and gzip do not compress short strings +well and often expand them. They also use lots of memory, which makes them unusable +in constrained environments like Arduino. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +class Unishox: + """ + This is a highly modified and optimized version of Unishox + for Tasmota, aimed at compressing `Rules` which are typically + short strings from 50 to 500 bytes. 
+ + @author Stephan Hadinger + @revised Norbert Richter + """ + + # pylint: disable=bad-continuation,bad-whitespace,line-too-long + cl_95 = [0x4000 + 3, 0x3F80 + 11, 0x3D80 + 11, 0x3C80 + 10, 0x3BE0 + 12, 0x3E80 + 10, 0x3F40 + 11, 0x3EC0 + 10, 0x3BA0 + 11, 0x3BC0 + 11, 0x3D60 + 11, 0x3B60 + 11, 0x3A80 + 10, 0x3AC0 + 10, 0x3A00 + 9, 0x3B00 + 10, 0x38C0 + 10, 0x3900 + 10, 0x3940 + 11, 0x3960 + 11, 0x3980 + 11, 0x39A0 + 11, 0x39C0 + 11, 0x39E0 + 12, 0x39F0 + 12, 0x3880 + 10, 0x3CC0 + 10, 0x3C00 + 9, 0x3D00 + 10, 0x3E00 + 9, 0x3F00 + 10, 0x3B40 + 11, 0x3BF0 + 12, 0x2B00 + 8, 0x21C0 + 11, 0x20C0 + 10, 0x2100 + 10, 0x2600 + 7, 0x2300 + 11, 0x21E0 + 12, 0x2140 + 11, 0x2D00 + 8, 0x2358 + 13, 0x2340 + 12, 0x2080 + 10, 0x21A0 + 11, 0x2E00 + 8, 0x2C00 + 8, 0x2180 + 11, 0x2350 + 13, 0x2F80 + 9, 0x2F00 + 9, 0x2A00 + 8, 0x2160 + 11, 0x2330 + 12, 0x21F0 + 12, 0x2360 + 13, 0x2320 + 12, 0x2368 + 13, 0x3DE0 + 12, 0x3FA0 + 11, 0x3DF0 + 12, 0x3D40 + 11, 0x3F60 + 11, 0x3FF0 + 12, 0xB000 + 4, 0x1C00 + 7, 0x0C00 + 6, 0x1000 + 6, 0x6000 + 3, 0x3000 + 7, 0x1E00 + 8, 0x1400 + 7, 0xD000 + 4, 0x3580 + 9, 0x3400 + 8, 0x0800 + 6, 0x1A00 + 7, 0xE000 + 4, 0xC000 + 4, 0x1800 + 7, 0x3500 + 9, 0xF800 + 5, 0xF000 + 5, 0xA000 + 4, 0x1600 + 7, 0x3300 + 8, 0x1F00 + 8, 0x3600 + 9, 0x3200 + 8, 0x3680 + 9, 0x3DA0 + 11, 0x3FC0 + 11, 0x3DC0 + 11, 0x3FE0 + 12] + + # enum {SHX_STATE_1 = 1, SHX_STATE_2}; // removed Unicode state + SHX_STATE_1 = 1 + SHX_STATE_2 = 2 + + SHX_SET1 = 0 + SHX_SET1A = 1 + SHX_SET1B = 2 + SHX_SET2 = 3 + + sets = [[0, ' ', 'e', 0, 't', 'a', 'o', 'i', 'n', 's', 'r'], + [0, 'l', 'c', 'd', 'h', 'u', 'p', 'm', 'b', 'g', 'w'], + ['f', 'y', 'v', 'k', 'q', 'j', 'x', 'z', 0, 0, 0], + [0, '9', '0', '1', '2', '3', '4', '5', '6', '7', '8'], + ['.', ',', '-', '/', '?', '+', ' ', '(', ')', '$', '@'], + [';', '#', ':', '<', '^', '*', '"', '{', '}', '[', ']'], + ['=', '%', '\'', '>', '&', '_', '!', '\\', '|', '~', '`']] + + us_vcode = [2 + (0 << 3), 3 + (3 << 3), 3 + (1 << 3), 4 + (6 << 3), 0, + # 5, 
6, 7, 8, 9, 10 + 4 + (4 << 3), 3 + (2 << 3), 4 + (8 << 3), 0, 0, 0, + # 11, 12, 13, 14, 15 + 4 + (7 << 3), 0, 4 + (5 << 3), 0, 5 + (9 << 3), + # 16, 17, 18, 19, 20, 21, 22, 23 + 0, 0, 0, 0, 0, 0, 0, 0, + # 24, 25, 26, 27, 28, 29, 30, 31 + 0, 0, 0, 0, 0, 0, 0, 5 + (10 << 3) ] + # 0, 1, 2, 3, 4, 5, 6, 7, + us_hcode = [1 + (1 << 3), 2 + (0 << 3), 0, 3 + (2 << 3), 0, 0, 0, 5 + (3 << 3), + # 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 5 + (5 << 3), + # 16, 17, 18, 19, 20, 21, 22, 23 + 0, 0, 0, 0, 0, 0, 0, 5 + (4 << 3), + # 24, 25, 26, 27, 28, 29, 30, 31 + 0, 0, 0, 0, 0, 0, 0, 5 + (6 << 3) ] + # pylint: enable=bad-continuation,bad-whitespace + + ESCAPE_MARKER = 0x2A + + TERM_CODE = 0x37C0 + # TERM_CODE_LEN = 10 + DICT_CODE = 0x0000 + DICT_CODE_LEN = 5 + #DICT_OTHER_CODE = 0x0000 + #DICT_OTHER_CODE_LEN = 6 + RPT_CODE_TASMOTA = 0x3780 + RPT_CODE_TASMOTA_LEN = 10 + BACK2_STATE1_CODE = 0x2000 + BACK2_STATE1_CODE_LEN = 4 + #BACK_FROM_UNI_CODE = 0xFE00 + #BACK_FROM_UNI_CODE_LEN = 8 + LF_CODE = 0x3700 + LF_CODE_LEN = 9 + TAB_CODE = 0x2400 + TAB_CODE_LEN = 7 + ALL_UPPER_CODE = 0x2200 + ALL_UPPER_CODE_LEN = 8 + SW2_STATE2_CODE = 0x3800 + SW2_STATE2_CODE_LEN = 7 + ST2_SPC_CODE = 0x3B80 + ST2_SPC_CODE_LEN = 11 + BIN_CODE_TASMOTA = 0x8000 + BIN_CODE_TASMOTA_LEN = 3 + + NICE_LEN = 5 + + mask = [0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF] + + # pylint: disable=missing-function-docstring,invalid-name + + # Input + # out = bytearray + def append_bits(self, out, ol, code, clen, state): + #print("Append bits {ol} {code} {clen} {state}".format(ol=ol, code=code, clen=clen, state=state)) + if state == self.SHX_STATE_2: + # remove change state prefix + if (code >> 9) == 0x1C: + code <<= 7 + clen -= 7 + while clen > 0: + cur_bit = ol % 8 + blen = 8 if (clen > 8) else clen + a_byte = (code >> 8) & self.mask[blen - 1] + #print("append_bits a_byte {ab} blen {blen}".format(ab=a_byte,blen=blen)) + a_byte >>= cur_bit + if blen + cur_bit > 8: + blen = (8 - cur_bit) + if cur_bit == 0: + 
out[ol // 8] = a_byte + else: + out[ol // 8] |= a_byte + code <<= blen + ol += blen + if 0 == ol % 8: # pylint: disable=misplaced-comparison-constant + # we completed a full byte + last_c = out[(ol // 8) - 1] + if last_c in (0, self.ESCAPE_MARKER): + out[ol // 8] = 1 + last_c # increment to 0x01 or 0x2B + out[(ol // 8) -1] = self.ESCAPE_MARKER # replace old value with marker + ol += 8 # add one full byte + clen -= blen + return ol + + codes = [0x82, 0xC3, 0xE5, 0xED, 0xF5] # pylint: disable=bad-whitespace + bit_len = [ 5, 7, 9, 12, 16] # pylint: disable=bad-whitespace + + def encodeCount(self, out, ol, count): + #print("encodeCount ol = {ol}, count = {count}".format(ol=ol, count=count)) + till = 0 + base = 0 + for i in range(len(self.bit_len)): + bit_len_i = self.bit_len[i] + till += (1 << bit_len_i) + if count < till: + codes_i = self.codes[i] + ol = self.append_bits(out, ol, (codes_i & 0xF8) << 8, codes_i & 0x07, 1) + #print("encodeCount append_bits ol = {ol}, code = {code}, len = {len}".format(ol=ol,code=(codes_i & 0xF8) << 8,len=codes_i & 0x07)) + ol = self.append_bits(out, ol, (count - base) << (16 - bit_len_i), bit_len_i, 1) + #print("encodeCount append_bits ol = {ol}, code = {code}, len = {len}".format(ol=ol,code=(count - base) << (16 - bit_len_i),len=bit_len_i)) + return ol + base = till + return ol + + # Returns (int, ol, state, is_all_upper) + def matchOccurance(self, inn, len_, l_, out, ol, state, is_all_upper): + # int j, k; + longest_dist = 0 + longest_len = 0 + #for (j = l_ - self.NICE_LEN; j >= 0; j--) { + j = l_ - self.NICE_LEN + while j >= 0: + k = l_ + #for (k = l_; k < len && j + k - l_ < l_; k++) { + while k < len_ and j + k - l_ < l_: + if inn[k] != inn[j + k - l_]: + break + k += 1 + if k - l_ > self.NICE_LEN - 1: + match_len = k - l_ - self.NICE_LEN + match_dist = l_ - j - self.NICE_LEN + 1 + if match_len > longest_len: + longest_len = match_len + longest_dist = match_dist + j -= 1 + + if longest_len: + #print("longest_len 
{ll}".format(ll=longest_len)) + #ol_save = ol + if state == self.SHX_STATE_2 or is_all_upper: + is_all_upper = 0 + state = self.SHX_STATE_1 + ol = self.append_bits(out, ol, self.BACK2_STATE1_CODE, self.BACK2_STATE1_CODE_LEN, state) + + ol = self.append_bits(out, ol, self.DICT_CODE, self.DICT_CODE_LEN, 1) + ol = self.encodeCount(out, ol, longest_len) + ol = self.encodeCount(out, ol, longest_dist) + #print("longest_len {ll} longest_dist {ld} ol {ols}-{ol}".format(ll=longest_len, ld=longest_dist, ol=ol, ols=ol_save)) + l_ += longest_len + self.NICE_LEN + l_ -= 1 + + return l_, ol, state, is_all_upper + return -l_, ol, state, is_all_upper - - -cl_95 = [ 0x4000 + 3, 0x3F80 + 11, 0x3D80 + 11, 0x3C80 + 10, 0x3BE0 + 12, 0x3E80 + 10, 0x3F40 + 11, 0x3EC0 + 10, 0x3BA0 + 11, 0x3BC0 + 11, 0x3D60 + 11, 0x3B60 + 11, 0x3A80 + 10, 0x3AC0 + 10, 0x3A00 + 9, 0x3B00 + 10, 0x38C0 + 10, 0x3900 + 10, 0x3940 + 11, 0x3960 + 11, 0x3980 + 11, 0x39A0 + 11, 0x39C0 + 11, 0x39E0 + 12, 0x39F0 + 12, 0x3880 + 10, 0x3CC0 + 10, 0x3C00 + 9, 0x3D00 + 10, 0x3E00 + 9, 0x3F00 + 10, 0x3B40 + 11, 0x3BF0 + 12, 0x2B00 + 8, 0x21C0 + 11, 0x20C0 + 10, 0x2100 + 10, 0x2600 + 7, 0x2300 + 11, 0x21E0 + 12, 0x2140 + 11, 0x2D00 + 8, 0x2358 + 13, 0x2340 + 12, 0x2080 + 10, 0x21A0 + 11, 0x2E00 + 8, 0x2C00 + 8, 0x2180 + 11, 0x2350 + 13, 0x2F80 + 9, 0x2F00 + 9, 0x2A00 + 8, 0x2160 + 11, 0x2330 + 12, 0x21F0 + 12, 0x2360 + 13, 0x2320 + 12, 0x2368 + 13, 0x3DE0 + 12, 0x3FA0 + 11, 0x3DF0 + 12, 0x3D40 + 11, 0x3F60 + 11, 0x3FF0 + 12, 0xB000 + 4, 0x1C00 + 7, 0x0C00 + 6, 0x1000 + 6, 0x6000 + 3, 0x3000 + 7, 0x1E00 + 8, 0x1400 + 7, 0xD000 + 4, 0x3580 + 9, 0x3400 + 8, 0x0800 + 6, 0x1A00 + 7, 0xE000 + 4, 0xC000 + 4, 0x1800 + 7, 0x3500 + 9, 0xF800 + 5, 0xF000 + 5, 0xA000 + 4, 0x1600 + 7, 0x3300 + 8, 0x1F00 + 8, 0x3600 + 9, 0x3200 + 8, 0x3680 + 9, 0x3DA0 + 11, 0x3FC0 + 11, 0x3DC0 + 11, 0x3FE0 + 12 ] - - -# enum {SHX_STATE_1 = 1, SHX_STATE_2}; // removed Unicode state -SHX_STATE_1 = 1 -SHX_STATE_2 = 2 - - -SHX_SET1 = 0 -SHX_SET1A = 1 
-SHX_SET1B = 2 -SHX_SET2 = 3 -SHX_SET3 = 4 -SHX_SET4 = 5 -SHX_SET4A = 6 - -# char sets[][11] PROGMEM = -# {{ 0, ' ', 'e', 0, 't', 'a', 'o', 'i', 'n', 's', 'r'}, -# { 0, 'l', 'c', 'd', 'h', 'u', 'p', 'm', 'b', 'g', 'w'}, -# {'f', 'y', 'v', 'k', 'q', 'j', 'x', 'z', 0, 0, 0}, -# { 0, '9', '0', '1', '2', '3', '4', '5', '6', '7', '8'}, -# {'.', ',', '-', '/', '?', '+', ' ', '(', ')', '$', '@'}, -# {';', '#', ':', '<', '^', '*', '"', '{', '}', '[', ']'}, -# {'=', '%', '\'', '>', '&', '_', '!', '\\', '|', '~', '`'}}; -sets = [[ 0, ' ', 'e', 0, 't', 'a', 'o', 'i', 'n', 's', 'r'], - [ 0, 'l', 'c', 'd', 'h', 'u', 'p', 'm', 'b', 'g', 'w'], - ['f', 'y', 'v', 'k', 'q', 'j', 'x', 'z', 0, 0, 0], - [ 0, '9', '0', '1', '2', '3', '4', '5', '6', '7', '8'], - ['.', ',', '-', '/', '?', '+', ' ', '(', ')', '$', '@'], - [';', '#', ':', '<', '^', '*', '"', '{', '}', '[', ']'], - ['=', '%', '\'', '>', '&', '_', '!', '\\', '|', '~', '`']] - - -us_vcode = [2 + (0 << 3), 3 + (3 << 3), 3 + (1 << 3), 4 + (6 << 3), 0, -# 5, 6, 7, 8, 9, 10 - 4 + (4 << 3), 3 + (2 << 3), 4 + (8 << 3), 0, 0, 0, -# 11, 12, 13, 14, 15 - 4 + (7 << 3), 0, 4 + (5 << 3), 0, 5 + (9 << 3), -# 16, 17, 18, 19, 20, 21, 22, 23 - 0, 0, 0, 0, 0, 0, 0, 0, -# 24, 25, 26, 27, 28, 29, 30, 31 - 0, 0, 0, 0, 0, 0, 0, 5 + (10 << 3) ] -# 0, 1, 2, 3, 4, 5, 6, 7, -us_hcode = [1 + (1 << 3), 2 + (0 << 3), 0, 3 + (2 << 3), 0, 0, 0, 5 + (3 << 3), -# 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 0, 0, 0, 5 + (5 << 3), -# 16, 17, 18, 19, 20, 21, 22, 23 - 0, 0, 0, 0, 0, 0, 0, 5 + (4 << 3), -# 24, 25, 26, 27, 28, 29, 30, 31 - 0, 0, 0, 0, 0, 0, 0, 5 + (6 << 3) ] - -ESCAPE_MARKER = 0x2A - -TERM_CODE = 0x37C0 -TERM_CODE_LEN = 10 -DICT_CODE = 0x0000 -DICT_CODE_LEN = 5 -DICT_OTHER_CODE = 0x0000 -DICT_OTHER_CODE_LEN = 6 -RPT_CODE_TASMOTA = 0x3780 -RPT_CODE_TASMOTA_LEN = 10 -BACK2_STATE1_CODE = 0x2000 -BACK2_STATE1_CODE_LEN = 4 -BACK_FROM_UNI_CODE = 0xFE00 -BACK_FROM_UNI_CODE_LEN = 8 -LF_CODE = 0x3700 -LF_CODE_LEN = 9 -TAB_CODE = 0x2400 -TAB_CODE_LEN = 7 
-ALL_UPPER_CODE = 0x2200 -ALL_UPPER_CODE_LEN = 8 -SW2_STATE2_CODE = 0x3800 -SW2_STATE2_CODE_LEN = 7 -ST2_SPC_CODE = 0x3B80 -ST2_SPC_CODE_LEN = 11 -BIN_CODE_TASMOTA = 0x8000 -BIN_CODE_TASMOTA_LEN = 3 - -NICE_LEN = 5 - -mask = [ 0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF ] - -# Input -# out = bytearray -def append_bits(out, ol, code, clen, state): - #print("Append bits {ol} {code} {clen} {state}".format(ol=ol, code=code, clen=clen, state=state)) - if (state == SHX_STATE_2): - # remove change state prefix - if ((code >> 9) == 0x1C): - code <<= 7 - clen -= 7 - while (clen > 0): - cur_bit = ol % 8 - blen = 8 if (clen > 8) else clen - a_byte = (code >> 8) & mask[blen - 1] - #print("append_bits a_byte {ab} blen {blen}".format(ab=a_byte,blen=blen)) - a_byte >>= cur_bit - if (blen + cur_bit > 8): - blen = (8 - cur_bit) - if (cur_bit == 0): - out[ol // 8] = a_byte - else: - out[ol // 8] |= a_byte - code <<= blen - ol += blen - if (0 == ol % 8): - # we completed a full byte - last_c = out[(ol // 8) - 1] - if ((0 == last_c) or (ESCAPE_MARKER == last_c)): - out[ol // 8] = 1 + last_c # increment to 0x01 or 0x2B - out[(ol // 8) -1] = ESCAPE_MARKER # replace old value with marker - ol += 8 # add one full byte - clen -= blen; - return ol - - -codes = [ 0x82, 0xC3, 0xE5, 0xED, 0xF5 ] -bit_len = [ 5, 7, 9, 12, 16 ] - -def encodeCount(out, ol, count): - #print("encodeCount ol = {ol}, count = {count}".format(ol=ol, count=count)) - till = 0 - base = 0 - for i in range(len(bit_len)): - bit_len_i = bit_len[i] - till += (1 << bit_len_i) - if (count < till): - codes_i = codes[i] - ol = append_bits(out, ol, (codes_i & 0xF8) << 8, codes_i & 0x07, 1) - #print("encodeCount append_bits ol = {ol}, code = {code}, len = {len}".format(ol=ol,code=(codes_i & 0xF8) << 8,len=codes_i & 0x07)) - ol = append_bits(out, ol, (count - base) << (16 - bit_len_i), bit_len_i, 1) - #print("encodeCount append_bits ol = {ol}, code = {code}, len = {len}".format(ol=ol,code=(count - base) << (16 - 
bit_len_i),len=bit_len_i)) - return ol - base = till - return ol - -# Returns (int, ol, state, is_all_upper) -def matchOccurance(inn, lenn, l, out, ol, state, is_all_upper): - -#int matchOccurance(const char *in, int len, int l, char *out, int *ol, byte *state, byte *is_all_upper) { - # int j, k; - longest_dist = 0 - longest_len = 0 - #for (j = l - NICE_LEN; j >= 0; j--) { - j = l - NICE_LEN - while (j >= 0): - - k = l - #for (k = l; k < len && j + k - l < l; k++) { - while ((k < lenn) and (j + k - l < l)): - if (inn[k] != inn[j + k - l]): - break - k += 1 - if (k - l > NICE_LEN - 1): - match_len = k - l - NICE_LEN - match_dist = l - j - NICE_LEN + 1 - if (match_len > longest_len): - longest_len = match_len - longest_dist = match_dist - j -= 1 - - if (longest_len): - #print("longest_len {ll}".format(ll=longest_len)) - ol_save = ol - if (state == SHX_STATE_2 or is_all_upper): - is_all_upper = 0 - state = SHX_STATE_1 - ol = append_bits(out, ol, BACK2_STATE1_CODE, BACK2_STATE1_CODE_LEN, state) - - ol = append_bits(out, ol, DICT_CODE, DICT_CODE_LEN, 1) - ol = encodeCount(out, ol, longest_len) - ol = encodeCount(out, ol, longest_dist) - #print("longest_len {ll} longest_dist {ld} ol {ols}-{ol}".format(ll=longest_len, ld=longest_dist, ol=ol, ols=ol_save)) - l += longest_len + NICE_LEN - l -= 1 - - return l, ol, state, is_all_upper - return -l, ol, state, is_all_upper - - -def unishox_compress(inn, len, out, len_out): -# int32_t unishox_compress(const char *in, size_t len, char *out, size_t len_out) { - - ol = 0 - state = SHX_STATE_1 - is_all_upper = 0 - l = 0 - while (l < len): - # for (l=0; l 0): - #print("matchOccurance l = {l} l_old = {lo}".format(l=l,lo=l_old)) - l += 1 # for loop - continue - - l = -l - - if (state == SHX_STATE_2): # if Set2 - if ((c_in >= ord(' ') and c_in <= ord('@')) or (c_in >= ord('[') and c_in <= ord('`')) or (c_in >= ord('{') and c_in <= ord('~'))): - pass - else: - state = SHX_STATE_1 # back to Set1 and lower case - ol = append_bits(out, ol, 
BACK2_STATE1_CODE, BACK2_STATE1_CODE_LEN, state) - - is_upper = 0 - if (c_in >= ord('A') and c_in <= ord('Z')): - is_upper = 1 - else: - if (is_all_upper): + def compress(self, inn, len_, out, len_out): + ol = 0 + state = self.SHX_STATE_1 is_all_upper = 0 - ol = append_bits(out, ol, BACK2_STATE1_CODE, BACK2_STATE1_CODE_LEN, state) + l = 0 + while l < len_: + # for (l=0; l= 32 and c_in <= 126): - if (is_upper and not is_all_upper): - ll=l+5 - # for (ll=l+5; ll>=l && ll=l and ll ord('Z')): - break + if l and l < len_ - 4: + if c_in == inn[l - 1] and c_in == inn[l + 1] and c_in == inn[l + 2] and c_in == inn[l + 3]: + rpt_count = l + 4 + while rpt_count < len_ and inn[rpt_count] == c_in: + rpt_count += 1 + rpt_count -= l - ll -= 1 + if state == self.SHX_STATE_2 or is_all_upper: + is_all_upper = 0 + state = self.SHX_STATE_1 + ol = self.append_bits(out, ol, self.BACK2_STATE1_CODE, self.BACK2_STATE1_CODE_LEN, state) # back to lower case and Set1 - if (ll == l-1): - ol = append_bits(out, ol, ALL_UPPER_CODE, ALL_UPPER_CODE_LEN, state) # CapsLock - is_all_upper = 1; + ol = self.append_bits(out, ol, self.RPT_CODE_TASMOTA, self.RPT_CODE_TASMOTA_LEN, 1) # reusing CRLF for RPT + ol = self.encodeCount(out, ol, rpt_count - 4) + l += rpt_count + #l -= 1 + continue - if (state == SHX_STATE_1 and c_in >= ord('0') and c_in <= ord('9')): - ol = append_bits(out, ol, SW2_STATE2_CODE, SW2_STATE2_CODE_LEN, state) # Switch to sticky Set2 - state = SHX_STATE_2 + if l < (len_ - self.NICE_LEN + 1): + #l_old = l + (l, ol, state, is_all_upper) = self.matchOccurance(inn, len_, l, out, ol, state, is_all_upper) + if l > 0: + #print("matchOccurance l = {l} l_old = {lo}".format(l=l,lo=l_old)) + l += 1 # for loop + continue - c_in -= 32 - if (is_all_upper and is_upper): - c_in += 32 - if (c_in == 0 and state == SHX_STATE_2): - ol = append_bits(out, ol, ST2_SPC_CODE, ST2_SPC_CODE_LEN, state) # space from Set2 ionstead of Set1 - else: - # ol = append_bits(out, ol, pgm_read_word(&c_95[c_in]), 
pgm_read_byte(&l_95[c_in]), state); // original version with c/l in split arrays - cl = cl_95[c_in] - ol = append_bits(out, ol, cl & 0xFFF0, cl & 0x000F, state) + l = -l - #lse: - # // if (c_in == 13 && c_next == 10) { // CRLF disabled - # // ol = append_bits(out, ol, CRLF_CODE, CRLF_CODE_LEN, state); // CRLF - # // l++; - # // } else - elif (c_in == 10): - ol = append_bits(out, ol, LF_CODE, LF_CODE_LEN, state) # LF - elif (c_in == '\t'): - ol = append_bits(out, ol, TAB_CODE, TAB_CODE_LEN, state) # TAB - else: - ol = append_bits(out, ol, BIN_CODE_TASMOTA, BIN_CODE_TASMOTA_LEN, state) # Binary, we reuse the Unicode marker which 3 bits instead of 9 - ol = encodeCount(out, ol, (255 - c_in) & 0xFF) - + if state == self.SHX_STATE_2: # if Set2 + if ord(' ') <= c_in <= ord('@') or ord('[') <= c_in <= ord('`') or ord('{') <= c_in <= ord('~'): + pass + else: + state = self.SHX_STATE_1 # back to Set1 and lower case + ol = self.append_bits(out, ol, self.BACK2_STATE1_CODE, self.BACK2_STATE1_CODE_LEN, state) - # check that we have some headroom in the output buffer - if (ol // 8 >= len_out - 4): - return -1 # we risk overflow and crash - - l += 1 - - bits = ol % 8 - if (bits): - ol = append_bits(out, ol, TERM_CODE, 8 - bits, 1) # 0011 0111 1100 0000 TERM = 0011 0111 11 - return (ol + 7) // 8 - # return ol // 8 + 1 if (ol%8) else 0 - - -def getBitVal(inn, bit_no, count): - c_in = inn[bit_no >> 3] - if ((bit_no >> 3) and (ESCAPE_MARKER == inn[(bit_no >> 3) - 1])): - c_in -= 1 - r = 1 << count if (c_in & (0x80 >> (bit_no % 8))) else 0 - #print("getBitVal r={r}".format(r=r)) - return r - -# Returns: -# 0..11 -# or -1 if end of stream -def getCodeIdx(code_type, inn, len, bit_no_p): - code = 0 - count = 0 - while (count < 5): - # detect marker - if (ESCAPE_MARKER == inn[bit_no_p >> 3]): - bit_no_p += 8 # skip marker - - if (bit_no_p >= len): - return -1, bit_no_p - - code += getBitVal(inn, bit_no_p, count) - bit_no_p += 1 - count += 1 - code_type_code = code_type[code] - if 
(code_type_code and (code_type_code & 0x07) == count): - #print("getCodeIdx = {r}".format(r=code_type_code >> 3)) - return code_type_code >> 3, bit_no_p - - #print("getCodeIdx not found = {r}".format(r=1)) - return 1, bit_no_p - -def getNumFromBits(inn, bit_no, count): - ret = 0 - while (count): - count -= 1 - if (ESCAPE_MARKER == inn[bit_no >> 3]): - bit_no += 8 # skip marker - ret += getBitVal(inn, bit_no, count) - bit_no += 1 - return ret - -def readCount(inn, bit_no_p, len): - (idx, bit_no_p) = getCodeIdx(us_hcode, inn, len, bit_no_p) - if (idx >= 1): idx -= 1; # we skip v = 1 (code '0') since we no more accept 2 bits encoding - if ((idx >= 5) or (idx < 0)): return 0, bit_no_p # unsupported or end of stream - till = 0 - bit_len_idx = 0 - base = 0 - #for (uint32_t i = 0; i <= idx; i++) { - i = 0 - while (i <= idx): - # for i in range(idx): - base = till - bit_len_idx = bit_len[i] - till += (1 << bit_len_idx) - i += 1 - - count = getNumFromBits(inn, bit_no_p, bit_len_idx) + base - #print("readCount getNumFromBits = {count} ({bl})".format(count=count,bl=bit_len_idx)) - - bit_no_p += bit_len_idx - return count, bit_no_p - - -def decodeRepeat(inn, len, out, ol, bit_no): - #print("decodeRepeat Enter") - (dict_len, bit_no) = readCount(inn, bit_no, len) - dict_len += NICE_LEN - (dist, bit_no) = readCount(inn, bit_no, len) - dist += NICE_LEN - 1 - #memcpy(out + ol, out + ol - dist, dict_len); - i = 0 - while (i < dict_len): - #for i in range(dict_len): - out[ol + i] = out[ol - dist + i] - i += 1 - ol += dict_len - - return ol, bit_no - -def unishox_decompress(inn, len, out, len_out): - ol = 0 - bit_no = 0 - dstate = SHX_SET1 - is_all_upper = 0 - - len <<= 3 # *8, len in bits - out[ol] = 0 - while (bit_no < len): - c = 0 - is_upper = is_all_upper - orig_bit_no = bit_no - (v, bit_no) = getCodeIdx(us_vcode, inn, len, bit_no) # read vCode - #print("bit_no {b}. 
v = {v}".format(b=bit_no,v=v)) - if (v < 0): break # end of stream - h = dstate # Set1 or Set2 - if (v == 0): # Switch which is common to Set1 and Set2, first entry - (h, bit_no) = getCodeIdx(us_hcode, inn, len, bit_no) # read hCode - #print("bit_no {b}. h = {h}".format(b=bit_no,h=h)) - if (h < 0): break # end of stream - if (h == SHX_SET1): # target is Set1 - if (dstate == SHX_SET1): # Switch from Set1 to Set1 us UpperCase - if (is_all_upper): # if CapsLock, then back to LowerCase is_upper = 0 - is_all_upper = 0 - continue + if ord('A') <= c_in <= ord('Z'): + is_upper = 1 + else: + if is_all_upper: + is_all_upper = 0 + ol = self.append_bits(out, ol, self.BACK2_STATE1_CODE, self.BACK2_STATE1_CODE_LEN, state) - (v, bit_no) = getCodeIdx(us_vcode, inn, len, bit_no) # read again vCode - if (v < 0): break # end of stream - if (v == 0): - (h, bit_no) = getCodeIdx(us_hcode, inn, len, bit_no) # read second hCode - if (h < 0): break # end of stream - if (h == SHX_SET1): # If double Switch Set1, the CapsLock - is_all_upper = 1 - continue + if 32 <= c_in <= 126: + if is_upper and not is_all_upper: + ll = l+5 + # for (ll=l+5; ll>=l && ll ord('Z'): + break - is_upper = 1 # anyways, still uppercase - else: - dstate = SHX_SET1 # if Set was not Set1, switch to Set1 - continue + ll -= 1 - elif (h == SHX_SET2): # If Set2, switch dstate to Set2 - if (dstate == SHX_SET1): # TODO: is this test useful, there are only 2 states possible - dstate = SHX_SET2 - continue + if ll == l-1: + ol = self.append_bits(out, ol, self.ALL_UPPER_CODE, self.ALL_UPPER_CODE_LEN, state) # CapsLock + is_all_upper = 1 - if (h != SHX_SET1): # all other Sets (why not else) - (v, bit_no) = getCodeIdx(us_vcode, inn, len, bit_no) # we changed set, now read vCode for char - if (v < 0): break # end of stream + if state == self.SHX_STATE_1 and ord('0') <= c_in <= ord('9'): + ol = self.append_bits(out, ol, self.SW2_STATE2_CODE, self.SW2_STATE2_CODE_LEN, state) # Switch to sticky Set2 + state = self.SHX_STATE_2 - if (v 
== 0 and h == SHX_SET1A): - #print("v = 0, h = SHX_SET1A") - if (is_upper): - (temp, bit_no) = readCount(inn, bit_no, len) - out[ol] = 255 - temp # binary - ol += 1 - else: - (ol, bit_no) = decodeRepeat(inn, len, out, ol, bit_no) # dist - continue + c_in -= 32 + if is_all_upper and is_upper: + c_in += 32 + if c_in == 0 and state == self.SHX_STATE_2: + ol = self.append_bits(out, ol, self.ST2_SPC_CODE, self.ST2_SPC_CODE_LEN, state) # space from Set2 ionstead of Set1 + else: + # ol = self.append_bits(out, ol, pgm_read_word(&c_95[c_in]), pgm_read_byte(&l_95[c_in]), state); // original version with c/l in split arrays + cl = self.cl_95[c_in] + ol = self.append_bits(out, ol, cl & 0xFFF0, cl & 0x000F, state) - if (h == SHX_SET1 and v == 3): - # was Unicode, will do Binary instead - (temp, bit_no) = readCount(inn, bit_no, len) - out[ol] = 255 - temp # binary - ol += 1 - continue + elif c_in == 10: + ol = self.append_bits(out, ol, self.LF_CODE, self.LF_CODE_LEN, state) # LF + elif c_in == '\t': + ol = self.append_bits(out, ol, self.TAB_CODE, self.TAB_CODE_LEN, state) # TAB + else: + ol = self.append_bits(out, ol, self.BIN_CODE_TASMOTA, self.BIN_CODE_TASMOTA_LEN, state) # Binary, we reuse the Unicode marker which 3 bits instead of 9 + ol = self.encodeCount(out, ol, (255 - c_in) & 0xFF) - if (h < 7 and v < 11): # TODO: are these the actual limits? Not 11x7 ? 
- #print("h {h} v {v}".format(h=h,v=v)) - c = ord(sets[h][v]) # TODO - if (c >= ord('a') and c <= ord('z')): - if (is_upper): - c -= 32 # go to UpperCase for letters - else: # handle all other cases - if (is_upper and dstate == SHX_SET1 and v == 1): - c = ord('\t') # If UpperCase Space, change to TAB - if (h == SHX_SET1B): - if (8 == v): # was LF or RPT, now only LF - out[ol] = ord('\n') - ol += 1 - continue - if (9 == v): # was CRLF, now RPT - (count, bit_no) = readCount(inn, bit_no, len) - count += 4 - if (ol + count >= len_out): - return -1 # overflow + # check that we have some headroom in the output buffer + if ol // 8 >= len_out - 4: + return -1 # we risk overflow and crash - rpt_c = out[ol - 1] - while (count): + l += 1 + + bits = ol % 8 + if bits: + ol = self.append_bits(out, ol, self.TERM_CODE, 8 - bits, 1) # 0011 0111 1100 0000 TERM = 0011 0111 11 + return (ol + 7) // 8 + # return ol // 8 + 1 if (ol%8) else 0 + + + def getBitVal(self, inn, bit_no, count): + c_in = inn[bit_no >> 3] + if bit_no >> 3 and self.ESCAPE_MARKER == inn[(bit_no >> 3) - 1]: + c_in -= 1 + r = 1 << count if (c_in & (0x80 >> (bit_no % 8))) else 0 + #print("getBitVal r={r}".format(r=r)) + return r + + # Returns: + # 0..11 + # or -1 if end of stream + def getCodeIdx(self, code_type, inn, len_, bit_no_p): + code = 0 + count = 0 + while count < 5: + # detect marker + if self.ESCAPE_MARKER == inn[bit_no_p >> 3]: + bit_no_p += 8 # skip marker + + if bit_no_p >= len_: + return -1, bit_no_p + + code += self.getBitVal(inn, bit_no_p, count) + bit_no_p += 1 + count += 1 + code_type_code = code_type[code] + if code_type_code and (code_type_code & 0x07) == count: + #print("getCodeIdx = {r}".format(r=code_type_code >> 3)) + return code_type_code >> 3, bit_no_p + + #print("getCodeIdx not found = {r}".format(r=1)) + return 1, bit_no_p + + def getNumFromBits(self, inn, bit_no, count): + ret = 0 + while count: count -= 1 - out[ol] = rpt_c + if self.ESCAPE_MARKER == inn[bit_no >> 3]: + bit_no += 8 # skip 
marker + ret += self.getBitVal(inn, bit_no, count) + bit_no += 1 + return ret + + def readCount(self, inn, bit_no_p, len_): + (idx, bit_no_p) = self.getCodeIdx(self.us_hcode, inn, len_, bit_no_p) + if idx >= 1: + idx -= 1 # we skip v = 1 (code '0') since we no more accept 2 bits encoding + if idx >= 5 or idx < 0: + return 0, bit_no_p # unsupported or end of stream + till = 0 + bit_len_idx = 0 + base = 0 + #for (uint32_t i = 0; i <= idx; i++) { + i = 0 + while i <= idx: + # for i in range(idx): + base = till + bit_len_idx = self.bit_len[i] + till += (1 << bit_len_idx) + i += 1 + + count = self.getNumFromBits(inn, bit_no_p, bit_len_idx) + base + #print("readCount getNumFromBits = {count} ({bl})".format(count=count,bl=bit_len_idx)) + + bit_no_p += bit_len_idx + return count, bit_no_p + + def decodeRepeat(self, inn, len_, out, ol, bit_no): + #print("decodeRepeat Enter") + (dict_len, bit_no) = self.readCount(inn, bit_no, len_) + dict_len += self.NICE_LEN + (dist, bit_no) = self.readCount(inn, bit_no, len_) + dist += self.NICE_LEN - 1 + #memcpy(out + ol, out + ol - dist, dict_len); + i = 0 + while i < dict_len: + #for i in range(dict_len): + out[ol + i] = out[ol - dist + i] + i += 1 + ol += dict_len + + return ol, bit_no + + def decompress(self, inn, len_, out, len_out): + ol = 0 + bit_no = 0 + dstate = self.SHX_SET1 + is_all_upper = 0 + + len_ <<= 3 # *8, len_ in bits + out[ol] = 0 + while bit_no < len_: + c = 0 + is_upper = is_all_upper + (v, bit_no) = self.getCodeIdx(self.us_vcode, inn, len_, bit_no) # read vCode + #print("bit_no {b}. v = {v}".format(b=bit_no,v=v)) + if v < 0: + break # end of stream + h = dstate # Set1 or Set2 + if v == 0: # Switch which is common to Set1 and Set2, first entry + (h, bit_no) = self.getCodeIdx(self.us_hcode, inn, len_, bit_no) # read hCode + #print("bit_no {b}. 
h = {h}".format(b=bit_no,h=h)) + if h < 0: + break # end of stream + if h == self.SHX_SET1: # target is Set1 + if dstate == self.SHX_SET1: # Switch from Set1 to Set1 us UpperCase + if is_all_upper: # if CapsLock, then back to LowerCase + is_upper = 0 + is_all_upper = 0 + continue + + (v, bit_no) = self.getCodeIdx(self.us_vcode, inn, len_, bit_no) # read again vCode + if v < 0: + break # end of stream + if v == 0: + (h, bit_no) = self.getCodeIdx(self.us_hcode, inn, len_, bit_no) # read second hCode + if h < 0: + break # end of stream + if h == self.SHX_SET1: # If double Switch Set1, the CapsLock + is_all_upper = 1 + continue + + is_upper = 1 # anyways, still uppercase + else: + dstate = self.SHX_SET1 # if Set was not Set1, switch to Set1 + continue + + elif h == self.SHX_SET2: # If Set2, switch dstate to Set2 + if dstate == self.SHX_SET1: + dstate = self.SHX_SET2 + continue + + if h != self.SHX_SET1: # all other Sets (why not else) + (v, bit_no) = self.getCodeIdx(self.us_vcode, inn, len_, bit_no) # we changed set, now read vCode for char + if v < 0: + break # end of stream + + if v == 0 and h == self.SHX_SET1A: + #print("v = 0, h = self.SHX_SET1A") + if is_upper: + (temp, bit_no) = self.readCount(inn, bit_no, len_) + out[ol] = 255 - temp # binary + ol += 1 + else: + (ol, bit_no) = self.decodeRepeat(inn, len_, out, ol, bit_no) # dist + continue + + if h == self.SHX_SET1 and v == 3: + # was Unicode, will do Binary instead + (temp, bit_no) = self.readCount(inn, bit_no, len_) + out[ol] = 255 - temp # binary + ol += 1 + continue + + if h < 7 and v < 11: + #print("h {h} v {v}".format(h=h,v=v)) + c = ord(self.sets[h][v]) + if ord('a') <= c <= ord('z'): + if is_upper: + c -= 32 # go to UpperCase for letters + else: # handle all other cases + if is_upper and dstate == self.SHX_SET1 and v == 1: + c = ord('\t') # If UpperCase Space, change to TAB + if h == self.SHX_SET1B: + if 8 == v: # was LF or RPT, now only LF # pylint: disable=misplaced-comparison-constant + out[ol] = 
ord('\n') + ol += 1 + continue + + if 9 == v: # was CRLF, now RPT # pylint: disable=misplaced-comparison-constant + (count, bit_no) = self.readCount(inn, bit_no, len_) + count += 4 + if ol + count >= len_out: + return -1 # overflow + + rpt_c = out[ol - 1] + while count: + count -= 1 + out[ol] = rpt_c + ol += 1 + continue + + if 10 == v: # pylint: disable=misplaced-comparison-constant + break # TERM, stop decoding + + out[ol] = c ol += 1 - continue - if (10 == v): - break # TERM, stop decoding + if ol >= len_out: + return -1 # overflow - out[ol] = c - ol += 1 - - if (ol >= len_out): - return -1 # overflow - - return ol + return ol + # pylint: enable=missing-function-docstring if __name__ == "__main__": - inn = bytearray(b'ON Switch1#State==1 DO Add1 1 ENDON ON Var1#State==0 DO ShutterStop1 ENDON ON Var1#State==1 DO ShutterClose1 ENDON ON Var1#State>=2 DO Var1 0 ENDON ON Shutter1#Close DO Var1 0 ENDON ON Switch2#State==1 DO Add2 1 ENDON ON Var2#State==0 DO ShutterStop1 ENDON ON Var2#State==1 DO ShutterOpen1 ENDON ON Var2#State>=2 DO Var2 0 ENDON ON Shutter1#Open DO Var2 0 ENDON') - b = bytearray(2048) - l = unishox_compress(inn, len(inn), b, len(b)) - print("Compressed from {fromm} to {to} ({p}%)".format(fromm=len(inn),to=l,p=(100-l/len(inn)*100))) + # pylint: disable=line-too-long + UNISHOX = Unishox() + BYTES_ = bytearray(2048) + INN = bytearray(b'ON Switch1#State==1 DO Add1 1 ENDON ON Var1#State==0 DO ShutterStop1 ENDON ON Var1#State==1 DO ShutterClose1 ENDON ON Var1#State>=2 DO Var1 0 ENDON ON Shutter1#Close DO Var1 0 ENDON ON Switch2#State==1 DO Add2 1 ENDON ON Var2#State==0 DO ShutterStop1 ENDON ON Var2#State==1 DO ShutterOpen1 ENDON ON Var2#State>=2 DO Var2 0 ENDON ON Shutter1#Open DO Var2 0 ENDON') + LEN_ = UNISHOX.compress(INN, len(INN), BYTES_, len(BYTES_)) + print("Compressed from {fromm} to {to} ({p}%)".format(fromm=len(INN), to=LEN_, p=(100-LEN_/len(INN)*100))) - out = bytearray(2048) - l = unishox_decompress(b, l, out, len(out)) \ No newline at end of 
file + OUT = bytearray(2048) + LEN_ = UNISHOX.decompress(BYTES_, LEN_, OUT, len(OUT)) + print(str(OUT, 'utf-8').split('\x00')[0])