All files / ethers.js/src.ts/utils utf8.ts

100% Statements 324/324
100% Branches 60/60
100% Functions 8/8
100% Lines 324/324

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 3261x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 9x 9x 9x 1x 17x 17x 17x 17x 6x 6x 10x 4x 4x 6x 6x 11x 11x 11x 17x 2x 2x 9x 9x 9x 9x 1x 9x 9x 9x 9x 1x 1x 1x 1x 8x 8x 8x 8x 8x 8x 8x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 21409x 21409x 21409x 21409x 21409x 21409x 21409x 21409x 21409x 21409x 3597614x 3597614x 3597614x 3597614x 3597614x 3554029x 3554029x 3554029x 43585x 43585x 43585x 43585x 43585x 43585x 3597614x 9548x 9548x 9548x 9548x 3597614x 24174x 24174x 24174x 24174x 34037x 9854x 9854x 9854x 9863x 9x 3x 9x 6x 6x 6x 6x 43576x 43576x 3597614x 3x 3x 3x 43573x 43573x 43573x 
43573x 3597614x 87449x 87449x 87449x 87449x 3x 3x 3x 3x 87446x 87446x 87446x 87446x 43572x 43572x 3597614x 43570x 43570x 3597614x 3x 3x 3x 43567x 43567x 3597614x 6x 6x 6x 43561x 43561x 3597614x 3x 3x 3x 43558x 43558x 43558x 21400x 21400x 21400x 1x 1x 1x 1x 1x 1x 1x 1x 1x 65976x 65976x 65976x 16859x 16859x 16859x 65976x 65976x 65976x 2289610x 2289610x 2289610x 1816528x 1816528x 2289610x 51175x 51175x 51175x 473082x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 123951x 421907x 297956x 297956x 297956x 297956x 2289610x 65974x 65974x 65974x 1x 1x 21399x 21399x 3597579x 3587731x 3587731x 9848x 9848x 9848x 9848x 9848x 21399x 21399x 1x 1x 1x 1x 1x 1x 1x 1x 1x 21408x 21408x 1x 1x 1x 1x 1x 1x 1x 1x 1x    
/**
 *  Using strings in Ethereum (or any security-based system) requires
 *  additional care. These utilities attempt to mitigate some of the
 *  safety issues as well as provide the ability to recover and analyse
 *  strings.
 *
 *  @_subsection api/utils:Strings and UTF-8  [about-strings]
 */
import { getBytes } from "./data.js";
import { assertArgument, assertNormalize } from "./errors.js";
 
import type { BytesLike } from "./index.js";
 
 
///////////////////////////////
 
/**
 *  The standard normalization forms.
 */
export type UnicodeNormalizationForm = "NFC" | "NFD" | "NFKC" | "NFKD";
 
/**
 *  When using the UTF-8 error API the following errors can be intercepted
 *  and processed as the %%reason%% passed to the [[Utf8ErrorFunc]].
 *
 *  **``"UNEXPECTED_CONTINUE"``** - a continuation byte was present where there
 *  was nothing to continue.
 *
 *  **``"BAD_PREFIX"``** - an invalid (non-continuation) byte to start a
 *  UTF-8 codepoint was found.
 *
 *  **``"OVERRUN"``** - the string is too short to process the expected
 *  codepoint length.
 *
 *  **``"MISSING_CONTINUE"``** - a continuation byte was expected but
 *  not found. The %%offset%% indicates the index the continuation byte
 *  was expected at.
 *
 *  **``"OUT_OF_RANGE"``** - the computed code point is outside the range
 *  for UTF-8. The %%badCodepoint%% indicates the computed codepoint, which was
 *  outside the valid UTF-8 range.
 *
 *  **``"UTF16_SURROGATE"``** - the UTF-8 string contained an encoded UTF-16
 *  surrogate. The %%badCodepoint%% is the computed codepoint, which was
 *  inside the UTF-16 surrogate range.
 *
 *  **``"OVERLONG"``** - the string is an overlong representation. The
 *  %%badCodepoint%% indicates the computed codepoint, which has already
 *  been bounds checked.
 *
 *
 *  @returns string
 */
export type Utf8ErrorReason = "UNEXPECTED_CONTINUE" | "BAD_PREFIX" | "OVERRUN" |
    "MISSING_CONTINUE" | "OUT_OF_RANGE" | "UTF16_SURROGATE" | "OVERLONG";
 
 
/**
 *  A callback that can be used with [[toUtf8String]] to analyse or
 *  recover from invalid UTF-8 data.
 *
 *  Parsing UTF-8 data is done through a simple Finite-State Machine (FSM)
 *  which calls the ``Utf8ErrorFunc`` if a fault is detected.
 *
 *  The %%reason%% indicates where in the FSM execution the fault
 *  occurred and the %%offset%% indicates where the input failed.
 *
 *  The %%bytes%% represents the raw UTF-8 data that was provided and
 *  %%output%% is the current array of UTF-8 code-points, which may
 *  be updated by the ``Utf8ErrorFunc``.
 *
 *  The value of the %%badCodepoint%% depends on the %%reason%%. See
 *  [[Utf8ErrorReason]] for details.
 *
 *  The function should return the number of bytes that should be skipped
 *  when control resumes to the FSM.
 */
export type Utf8ErrorFunc = (reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number) => number;
 
 
/**
 *  The strict [[Utf8ErrorFunc]]: reports every fault by failing an
 *  argument assertion on ``"bytes"`` whose message carries the
 *  %%offset%% and the %%reason%%.
 *
 *  There is no return statement; the always-false assertion is relied
 *  on to throw, so no skip count is ever produced.
 */
function errorFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {
    const message = `invalid codepoint at offset ${ offset }; ${ reason }`;
    assertArgument(false, message, "bytes", bytes);
}
 
/**
 *  The lenient [[Utf8ErrorFunc]]: never touches %%output%% and only
 *  reports how many additional bytes the decoder should skip.
 *
 *  - ``"BAD_PREFIX"`` / ``"UNEXPECTED_CONTINUE"``: the number of
 *    consecutive continuation bytes (``10xx xxxx``) that directly follow
 *    %%offset%%
 *  - ``"OVERRUN"``: everything after %%offset%% up to the end of %%bytes%%
 *  - any other reason: ``0``
 */
function ignoreFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {
    switch (reason) {
        case "BAD_PREFIX":
        case "UNEXPECTED_CONTINUE": {
            // Walk past the run of continuation bytes trailing the bad byte
            let cursor = offset + 1;
            while (cursor < bytes.length && (bytes[cursor] & 0xc0) === 0x80) {
                cursor++;
            }
            return cursor - offset - 1;
        }

        case "OVERRUN":
            // The sequence would run past the end of the data, so jump to
            // the end (the byte at offset itself has already been consumed)
            return bytes.length - offset - 1;

        default:
            // Nothing further to skip
            return 0;
    }
}
 
/**
 *  The substituting [[Utf8ErrorFunc]].
 *
 *  For every %%reason%% except ``"OVERLONG"`` the replacement character
 *  (``0xfffd``) is appended to %%output%% and the skip count is whatever
 *  ``ignoreFunc`` returns for the same arguments.
 *
 *  For ``"OVERLONG"`` the %%badCodepoint%% itself is appended (after
 *  asserting that it is a number) and nothing extra is skipped.
 */
function replaceFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {

    if (reason !== "OVERLONG") {
        // Emit the replacement character, then skip exactly as "ignore" would
        output.push(0xfffd);
        return ignoreFunc(reason, offset, bytes, output, badCodepoint);
    }

    // Overlong representations are otherwise "valid" code points; they are
    // just not the shortest (distinguished) encoding, so keep the value
    assertArgument(typeof(badCodepoint) === "number", "invalid bad code point for replacement", "badCodepoint", badCodepoint);
    output.push(badCodepoint);
    return 0;
}
 
/**
 *  A handful of popular, built-in UTF-8 error handling strategies.
 *
 *  **``"error"``** - throws on ANY illegal UTF-8 sequence or
 *  non-canonical (overlong) codepoints (this is the default)
 *
 *  **``"ignore"``** - silently drops any illegal UTF-8 sequence
 *  and accepts non-canonical (overlong) codepoints
 *
 *  **``"replace"``** - replace any illegal UTF-8 sequence with the
 *  UTF-8 replacement character (i.e. ``"\\ufffd"``) and accepts
 *  non-canonical (overlong) codepoints
 *
 *  NOTE(review): as implemented in this file, the ``"ignore"`` handler
 *  appends nothing to the output for an ``"OVERLONG"`` fault, so an
 *  overlong codepoint is dropped rather than kept; only ``"replace"``
 *  appends the overlong codepoint. Confirm which behavior is intended.
 *
 *  The object is frozen, so the three entries cannot be reassigned.
 *
 *  @returns: Record<"error" | "ignore" | "replace", Utf8ErrorFunc>
 */
export const Utf8ErrorFuncs: Readonly<Record<"error" | "ignore" | "replace", Utf8ErrorFunc>> = Object.freeze({
    error: errorFunc,
    ignore: ignoreFunc,
    replace: replaceFunc
});
 
// http://stackoverflow.com/questions/13356493/decode-utf-8-with-javascript#13691499
/**
 *  Decodes the UTF-8 data %%_bytes%% into an array of code points.
 *
 *  Whenever a fault is found, %%onError%% (default: ``Utf8ErrorFuncs.error``)
 *  is called with the reason, an offset, the raw bytes and the code points
 *  collected so far; its return value is added to the read position before
 *  decoding resumes. The faulty sequence itself is never appended here; any
 *  output for it must be pushed by the handler.
 *
 *  Offsets passed to the handler are the index of the sequence's first
 *  byte, except for ``"MISSING_CONTINUE"`` which receives the index at
 *  which the continuation byte was expected.
 */
function getUtf8CodePoints(_bytes: BytesLike, onError?: Utf8ErrorFunc): Array<number> {
    const handler: Utf8ErrorFunc = (onError != null) ? onError: Utf8ErrorFuncs.error;

    const data = getBytes(_bytes, "bytes");

    const codePoints: Array<number> = [];
    let pos = 0;

    while (pos < data.length) {

        // Index of the first byte of the sequence being decoded
        const start = pos;
        const lead = data[pos++];

        // 0xxx xxxx; single byte (ASCII)
        if ((lead & 0x80) === 0) {
            codePoints.push(lead);
            continue;
        }

        // Multibyte; work out how many continuation bytes follow and the
        // largest value a shorter encoding could already have represented
        let trailing: number;
        let shorterMax: number;

        if ((lead >> 5) === 0x06) {
            // 110x xxxx 10xx xxxx
            trailing = 1;
            shorterMax = 0x7f;

        } else if ((lead >> 4) === 0x0e) {
            // 1110 xxxx 10xx xxxx 10xx xxxx
            trailing = 2;
            shorterMax = 0x7ff;

        } else if ((lead >> 3) === 0x1e) {
            // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
            trailing = 3;
            shorterMax = 0xffff;

        } else {
            // 10xx xxxx is a continuation byte with nothing to continue;
            // anything else (1111 1xxx) can never start a sequence
            const reason: Utf8ErrorReason = ((lead >> 6) === 0x02) ? "UNEXPECTED_CONTINUE": "BAD_PREFIX";
            pos += handler(reason, start, data, codePoints);
            continue;
        }

        // The sequence would extend past the end of the data
        if (start + trailing >= data.length) {
            pos += handler("OVERRUN", start, data, codePoints);
            continue;
        }

        // Payload bits of the lead byte (the length prefix is masked off)
        let value = lead & (0x7f >> trailing);

        let complete = true;
        for (let remaining = trailing; remaining > 0; remaining--) {
            const next = data[pos];

            // Not a continuation byte; leave it to be decoded on its own
            if ((next & 0xc0) !== 0x80) {
                pos += handler("MISSING_CONTINUE", pos, data, codePoints);
                complete = false;
                break;
            }

            value = (value << 6) | (next & 0x3f);
            pos++;
        }

        if (!complete) { continue; }

        // Beyond the maximum code point
        if (value > 0x10ffff) {
            pos += handler("OUT_OF_RANGE", start, data, codePoints, value);
            continue;
        }

        // Reserved for UTF-16 surrogate halves
        if (value >= 0xd800 && value <= 0xdfff) {
            pos += handler("UTF16_SURROGATE", start, data, codePoints, value);
            continue;
        }

        // Overlong; more bytes were used than the value needs
        if (value <= shorterMax) {
            pos += handler("OVERLONG", start, data, codePoints, value);
            continue;
        }

        codePoints.push(value);
    }

    return codePoints;
}
 
// http://stackoverflow.com/questions/18729405/how-to-convert-utf8-string-to-byte-array
 
/**
 *  Returns the UTF-8 byte representation of %%str%%.
 *
 *  If %%form%% is specified, the string is normalized before encoding.
 *
 *  An unpaired UTF-16 surrogate (a high surrogate that is not followed by
 *  a low surrogate, or a low surrogate that is not preceded by a high
 *  surrogate) has no well-formed UTF-8 encoding, so it fails the ``"str"``
 *  argument assertion with ``"invalid surrogate pair"``.
 */
export function toUtf8Bytes(str: string, form?: UnicodeNormalizationForm): Uint8Array {
    assertArgument(typeof(str) === "string", "invalid string value", "str", str);

    if (form != null) {
        assertNormalize(form);
        str = str.normalize(form);
    }

    const result: Array<number> = [];
    for (let i = 0; i < str.length; i++) {
        const c = str.charCodeAt(i);

        if (c < 0x80) {
            // 0xxx xxxx
            result.push(c);

        } else if (c < 0x800) {
            // 110x xxxx 10xx xxxx
            result.push((c >> 6) | 0xc0);
            result.push((c & 0x3f) | 0x80);

        } else if ((c & 0xf800) === 0xd800) {
            // Any surrogate code unit (0xd800 - 0xdfff); it is only valid as
            // a high surrogate immediately followed by a low surrogate
            i++;
            const c2 = str.charCodeAt(i);

            assertArgument(
                (c & 0xfc00) === 0xd800 && i < str.length && ((c2 & 0xfc00) === 0xdc00),
                "invalid surrogate pair", "str", str);

            // Surrogate Pair; 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
            const pair = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff);
            result.push((pair >> 18) | 0xf0);
            result.push(((pair >> 12) & 0x3f) | 0x80);
            result.push(((pair >> 6) & 0x3f) | 0x80);
            result.push((pair & 0x3f) | 0x80);

        } else {
            // 1110 xxxx 10xx xxxx 10xx xxxx
            result.push((c >> 12) | 0xe0);
            result.push(((c >> 6) & 0x3f) | 0x80);
            result.push((c & 0x3f) | 0x80);
        }
    }

    return new Uint8Array(result);
}
 
//export 
/**
 *  Builds a string from %%codePoints%%: values up to ``0xffff`` become a
 *  single UTF-16 code unit and anything larger becomes a high/low
 *  surrogate pair.
 */
function _toUtf8String(codePoints: Array<number>): string {
    return codePoints.reduce((text, codePoint) => {
        // Fits in one UTF-16 code unit
        if (codePoint <= 0xffff) {
            return text + String.fromCharCode(codePoint);
        }

        // Supplementary plane; split the 20-bit offset into two 10-bit halves
        const offset = codePoint - 0x10000;
        const high = ((offset >> 10) & 0x3ff) + 0xd800;
        const low = (offset & 0x3ff) + 0xdc00;
        return text + String.fromCharCode(high, low);
    }, "");
}
 
/**
 *  Returns the string represented by the UTF-8 data %%bytes%%.
 *
 *  The data is first decoded into code points and those are then
 *  assembled into a string. When the %%onError%% function is specified,
 *  it is called on UTF-8 errors, allowing recovery using the
 *  [[Utf8ErrorFunc]] API. (default: [error](Utf8ErrorFuncs))
 */
export function toUtf8String(bytes: BytesLike, onError?: Utf8ErrorFunc): string {
    const codePoints = getUtf8CodePoints(bytes, onError);
    return _toUtf8String(codePoints);
}
 
/**
 *  Returns the UTF-8 code-points for %%str%%.
 *
 *  The string is encoded to UTF-8 bytes (normalized first when %%form%%
 *  is specified) and those bytes are decoded back into code points using
 *  the default error handling.
 */
export function toUtf8CodePoints(str: string, form?: UnicodeNormalizationForm): Array<number> {
    const encoded = toUtf8Bytes(str, form);
    return getUtf8CodePoints(encoded);
}