V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
string-16.cc
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/inspector/string-16.h"
6 
7 #include <algorithm>
8 #include <cctype>
9 #include <cstdlib>
10 #include <cstring>
11 #include <limits>
12 #include <string>
13 
14 #include "src/base/platform/platform.h"
15 #include "src/conversions.h"
16 
17 namespace v8_inspector {
18 
19 namespace {
20 
21 bool isASCII(UChar c) { return !(c & ~0x7F); }
22 
23 bool isSpaceOrNewLine(UChar c) {
24  return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
25 }
26 
27 int64_t charactersToInteger(const UChar* characters, size_t length,
28  bool* ok = nullptr) {
29  std::vector<char> buffer;
30  buffer.reserve(length + 1);
31  for (size_t i = 0; i < length; ++i) {
32  if (!isASCII(characters[i])) {
33  if (ok) *ok = false;
34  return 0;
35  }
36  buffer.push_back(static_cast<char>(characters[i]));
37  }
38  buffer.push_back('\0');
39 
40  char* endptr;
41  int64_t result =
42  static_cast<int64_t>(std::strtoll(buffer.data(), &endptr, 10));
43  if (ok) *ok = !(*endptr);
44  return result;
45 }
46 
// Code unit (0xFFFD) that the converters in this file write in place of
// input they cannot represent.
47 const UChar replacementCharacter = 0xFFFD;
// 32-bit unsigned type used to hold a whole code point while converting.
48 using UChar32 = uint32_t;
49 
// Maps a non-ASCII UTF-8 lead byte to the length of the sequence it starts:
//   110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
// Returns 0 for anything else (bytes below 0xC0 and bytes 0xF8 and above).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  // Work on the raw byte value so the comparisons do not depend on whether
  // plain char is signed.
  const unsigned char lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0) return 0;
  if (lead < 0xE0) return 2;
  if (lead < 0xF0) return 3;
  if (lead < 0xF8) return 4;
  return 0;
}
57 
58 inline int inlineUTF8SequenceLength(char b0) {
59  return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
60 }
61 
62 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
63 // into the first byte, depending on how many bytes follow. There are
64 // as many entries in this table as there are UTF-8 sequence types.
65 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
66 // for *legal* UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total number of bytes in the sequence, so
// entry 0 is a placeholder. convertUTF16ToUTF8 only ever writes 1 to 4
// bytes, so it reads entries 1 through 4.
67 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,
68  0xF0, 0xF8, 0xFC};
69 
// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
  // The whole source range was converted.
  conversionOK,
  // The source ended in the middle of a character.
  sourceExhausted,
  // The target buffer had no room for the next character.
  targetExhausted,
  // The source contained an illegal or malformed sequence.
  sourceIllegal
};
76 
77 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
78  const UChar* sourceEnd, char** targetStart,
79  char* targetEnd, bool strict) {
80  ConversionResult result = conversionOK;
81  const UChar* source = *sourceStart;
82  char* target = *targetStart;
83  while (source < sourceEnd) {
84  UChar32 ch;
85  uint32_t bytesToWrite = 0;
86  const UChar32 byteMask = 0xBF;
87  const UChar32 byteMark = 0x80;
88  const UChar* oldSource =
89  source; // In case we have to back up because of target overflow.
90  ch = static_cast<uint16_t>(*source++);
91  // If we have a surrogate pair, convert to UChar32 first.
92  if (ch >= 0xD800 && ch <= 0xDBFF) {
93  // If the 16 bits following the high surrogate are in the source buffer...
94  if (source < sourceEnd) {
95  UChar32 ch2 = static_cast<uint16_t>(*source);
96  // If it's a low surrogate, convert to UChar32.
97  if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
98  ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
99  ++source;
100  } else if (strict) { // it's an unpaired high surrogate
101  --source; // return to the illegal value itself
102  result = sourceIllegal;
103  break;
104  }
105  } else { // We don't have the 16 bits following the high surrogate.
106  --source; // return to the high surrogate
107  result = sourceExhausted;
108  break;
109  }
110  } else if (strict) {
111  // UTF-16 surrogate values are illegal in UTF-32
112  if (ch >= 0xDC00 && ch <= 0xDFFF) {
113  --source; // return to the illegal value itself
114  result = sourceIllegal;
115  break;
116  }
117  }
118  // Figure out how many bytes the result will require
119  if (ch < static_cast<UChar32>(0x80)) {
120  bytesToWrite = 1;
121  } else if (ch < static_cast<UChar32>(0x800)) {
122  bytesToWrite = 2;
123  } else if (ch < static_cast<UChar32>(0x10000)) {
124  bytesToWrite = 3;
125  } else if (ch < static_cast<UChar32>(0x110000)) {
126  bytesToWrite = 4;
127  } else {
128  bytesToWrite = 3;
129  ch = replacementCharacter;
130  }
131 
132  target += bytesToWrite;
133  if (target > targetEnd) {
134  source = oldSource; // Back up source pointer!
135  target -= bytesToWrite;
136  result = targetExhausted;
137  break;
138  }
139  switch (bytesToWrite) {
140  case 4:
141  *--target = static_cast<char>((ch | byteMark) & byteMask);
142  ch >>= 6;
143  V8_FALLTHROUGH;
144  case 3:
145  *--target = static_cast<char>((ch | byteMark) & byteMask);
146  ch >>= 6;
147  V8_FALLTHROUGH;
148  case 2:
149  *--target = static_cast<char>((ch | byteMark) & byteMask);
150  ch >>= 6;
151  V8_FALLTHROUGH;
152  case 1:
153  *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
154  }
155  target += bytesToWrite;
156  }
157  *sourceStart = source;
158  *targetStart = target;
159  return result;
160 }
161 
// True when code point c fits in 16 bits (c <= 0xFFFF).
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xFFFF)

// True when code point c is in the range 0x10000..0x10FFFF.
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c)-0x010000) <= 0xFFFFF)

// True when code point c is in the surrogate range 0xD800..0xDFFF.
#define U_IS_SURROGATE(c) (((c)&0xFFFFF800) == 0xD800)

// First (lead) UTF-16 code unit of the surrogate pair encoding the given
// supplementary code point.
#define U16_LEAD(supplementary) \
  static_cast<UChar>(((supplementary) >> 10) + 0xD7C0)

// Second (trail) UTF-16 code unit of the surrogate pair encoding the given
// supplementary code point.
#define U16_TRAIL(supplementary) \
  static_cast<UChar>(((supplementary)&0x3FF) | 0xDC00)
203 
// Checks that the |length| bytes starting at |source| form one well-formed
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// Lengths outside 1..4 are rejected, since the Unicode definition of UTF-8
// only goes up to 4-byte sequences. Only the first |length| bytes are read.
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  const unsigned char lead = source[0];

  // The third and fourth bytes, when present, must be plain continuation
  // bytes (10xxxxxx).
  for (int index = length - 1; index >= 2; --index) {
    const unsigned char trail = source[index];
    if (trail < 0x80 || trail > 0xBF) return false;
  }

  if (length >= 2) {
    // The allowed range of the second byte depends on the lead byte; the
    // special cases reject overlong forms, surrogates and values above
    // 0x10FFFF.
    const unsigned char second = source[1];
    if (second > 0xBF) return false;
    switch (lead) {
      case 0xE0:
        if (second < 0xA0) return false;
        break;
      case 0xED:
        if (second > 0x9F) return false;
        break;
      case 0xF0:
        if (second < 0x90) return false;
        break;
      case 0xF4:
        if (second > 0x8F) return false;
        break;
      default:
        if (second < 0x80) return false;
        break;
    }
  }

  // Lead bytes 0x80..0xC1 are continuation bytes or overlong two-byte leads;
  // anything above 0xF4 would encode past 0x10FFFF.
  if (lead >= 0x80 && lead < 0xC2) return false;
  if (lead > 0xF4) return false;
  return true;
}
248 
249 // Magic values subtracted from a buffer value during UTF8 conversion.
250 // This table contains as many values as there might be trailing bytes
251 // in a UTF-8 sequence.
// readUTF8Sequence indexes this table with (sequence length - 1) and
// subtracts the entry from the sum of the shifted raw bytes.
252 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
253  0x00003080UL,
254  0x000E2080UL,
255  0x03C82080UL,
256  static_cast<UChar32>(0xFA082080UL),
257  static_cast<UChar32>(0x82082080UL)};
258 
259 static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
260  UChar32 character = 0;
261 
262  // The cases all fall through.
263  switch (length) {
264  case 6:
265  character += static_cast<unsigned char>(*sequence++);
266  character <<= 6;
267  V8_FALLTHROUGH;
268  case 5:
269  character += static_cast<unsigned char>(*sequence++);
270  character <<= 6;
271  V8_FALLTHROUGH;
272  case 4:
273  character += static_cast<unsigned char>(*sequence++);
274  character <<= 6;
275  V8_FALLTHROUGH;
276  case 3:
277  character += static_cast<unsigned char>(*sequence++);
278  character <<= 6;
279  V8_FALLTHROUGH;
280  case 2:
281  character += static_cast<unsigned char>(*sequence++);
282  character <<= 6;
283  V8_FALLTHROUGH;
284  case 1:
285  character += static_cast<unsigned char>(*sequence++);
286  }
287 
288  return character - offsetsFromUTF8[length - 1];
289 }
290 
291 ConversionResult convertUTF8ToUTF16(const char** sourceStart,
292  const char* sourceEnd, UChar** targetStart,
293  UChar* targetEnd, bool* sourceAllASCII,
294  bool strict) {
295  ConversionResult result = conversionOK;
296  const char* source = *sourceStart;
297  UChar* target = *targetStart;
298  UChar orAllData = 0;
299  while (source < sourceEnd) {
300  int utf8SequenceLength = inlineUTF8SequenceLength(*source);
301  if (sourceEnd - source < utf8SequenceLength) {
302  result = sourceExhausted;
303  break;
304  }
305  // Do this check whether lenient or strict
306  if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
307  utf8SequenceLength)) {
308  result = sourceIllegal;
309  break;
310  }
311 
312  UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
313 
314  if (target >= targetEnd) {
315  source -= utf8SequenceLength; // Back up source pointer!
316  result = targetExhausted;
317  break;
318  }
319 
320  if (U_IS_BMP(character)) {
321  // UTF-16 surrogate values are illegal in UTF-32
322  if (U_IS_SURROGATE(character)) {
323  if (strict) {
324  source -= utf8SequenceLength; // return to the illegal value itself
325  result = sourceIllegal;
326  break;
327  }
328  *target++ = replacementCharacter;
329  orAllData |= replacementCharacter;
330  } else {
331  *target++ = static_cast<UChar>(character); // normal case
332  orAllData |= character;
333  }
334  } else if (U_IS_SUPPLEMENTARY(character)) {
335  // target is a character in range 0xFFFF - 0x10FFFF
336  if (target + 1 >= targetEnd) {
337  source -= utf8SequenceLength; // Back up source pointer!
338  result = targetExhausted;
339  break;
340  }
341  *target++ = U16_LEAD(character);
342  *target++ = U16_TRAIL(character);
343  orAllData = 0xFFFF;
344  } else {
345  if (strict) {
346  source -= utf8SequenceLength; // return to the start
347  result = sourceIllegal;
348  break; // Bail out; shouldn't continue
349  } else {
350  *target++ = replacementCharacter;
351  orAllData |= replacementCharacter;
352  }
353  }
354  }
355  *sourceStart = source;
356  *targetStart = target;
357 
358  if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7F);
359 
360  return result;
361 }
362 
363 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
364 // check room is available.
365 static inline void putUTF8Triple(char*& buffer, UChar ch) {
366  *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
367  *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
368  *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
369 }
370 
371 } // namespace
372 
// Default construction; all members take their default values.
373 String16::String16() = default;
374 
// Member-wise copy.
375 String16::String16(const String16& other) = default;
376 
// Move construction: takes over other's character storage and copies its
// hash_code value. other.hash_code itself is left untouched.
377 String16::String16(String16&& other) V8_NOEXCEPT
378  : m_impl(std::move(other.m_impl)),
379  hash_code(other.hash_code) {}
380 
// Builds a string from the first |size| code units at |characters|.
381 String16::String16(const UChar* characters, size_t size)
382  : m_impl(characters, size) {}
383 
// Builds a string from |characters| using the pointer-only constructor of
// the underlying string, so no explicit length is supplied.
384 String16::String16(const UChar* characters) : m_impl(characters) {}
385 
// Builds a string from a NUL-terminated char string; the length is taken
// with std::strlen, so |characters| must not be null.
386 String16::String16(const char* characters)
387  : String16(characters, std::strlen(characters)) {}
388 
389 String16::String16(const char* characters, size_t size) {
390  m_impl.resize(size);
391  for (size_t i = 0; i < size; ++i) m_impl[i] = characters[i];
392 }
393 
// Builds a string holding a copy of |impl|.
394 String16::String16(const std::basic_string<UChar>& impl) : m_impl(impl) {}
395 
// Member-wise copy assignment.
396 String16& String16::operator=(const String16& other) = default;
397 
// Move assignment: takes over other's character storage and copies its
// hash_code value. other.hash_code itself is left untouched.
398 String16& String16::operator=(String16&& other) V8_NOEXCEPT {
399  m_impl = std::move(other.m_impl);
400  hash_code = other.hash_code;
401  return *this;
402 }
403 
404 // static
405 String16 String16::fromInteger(int number) {
406  char arr[50];
407  v8::internal::Vector<char> buffer(arr, arraysize(arr));
408  return String16(IntToCString(number, buffer));
409 }
410 
411 // static
412 String16 String16::fromInteger(size_t number) {
413  const size_t kBufferSize = 50;
414  char buffer[kBufferSize];
415 #if !defined(_WIN32) && !defined(_WIN64)
416  v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
417 #else
418  v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
419 #endif
420  return String16(buffer);
421 }
422 
423 // static
424 String16 String16::fromDouble(double number) {
425  char arr[50];
426  v8::internal::Vector<char> buffer(arr, arraysize(arr));
427  return String16(DoubleToCString(number, buffer));
428 }
429 
430 // static
431 String16 String16::fromDouble(double number, int precision) {
432  std::unique_ptr<char[]> str(
433  v8::internal::DoubleToPrecisionCString(number, precision));
434  return String16(str.get());
435 }
436 
// Parses the whole string as a base-10 integer by forwarding to
// charactersToInteger. |ok|, when non-null, receives that helper's success
// flag.
437 int64_t String16::toInteger64(bool* ok) const {
438  return charactersToInteger(characters16(), length(), ok);
439 }
440 
441 int String16::toInteger(bool* ok) const {
442  int64_t result = toInteger64(ok);
443  if (ok && *ok) {
444  *ok = result <= std::numeric_limits<int>::max() &&
445  result >= std::numeric_limits<int>::min();
446  }
447  return static_cast<int>(result);
448 }
449 
450 String16 String16::stripWhiteSpace() const {
451  if (!length()) return String16();
452 
453  size_t start = 0;
454  size_t end = length() - 1;
455 
456  // skip white space from start
457  while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;
458 
459  // only white space
460  if (start > end) return String16();
461 
462  // skip white space from end
463  while (end && isSpaceOrNewLine(characters16()[end])) --end;
464 
465  if (!start && end == length() - 1) return *this;
466  return String16(characters16() + start, end + 1 - start);
467 }
468 
// Default construction; all members take their default values.
469 String16Builder::String16Builder() = default;
470 
// Appends all code units of |s| to the end of the buffer.
471 void String16Builder::append(const String16& s) {
472  m_buffer.insert(m_buffer.end(), s.characters16(),
473  s.characters16() + s.length());
474 }
475 
// Appends a single code unit.
476 void String16Builder::append(UChar c) { m_buffer.push_back(c); }
477 
// Appends a single char, converted to UChar by plain value conversion.
478 void String16Builder::append(char c) {
479  UChar u = c;
480  m_buffer.push_back(u);
481 }
482 
// Appends |length| code units starting at |characters|.
483 void String16Builder::append(const UChar* characters, size_t length) {
484  m_buffer.insert(m_buffer.end(), characters, characters + length);
485 }
486 
// Appends |length| chars starting at |characters|; each char is converted to
// one buffer element by the container's element conversion.
487 void String16Builder::append(const char* characters, size_t length) {
488  m_buffer.insert(m_buffer.end(), characters, characters + length);
489 }
490 
491 void String16Builder::appendNumber(int number) {
492  constexpr int kBufferSize = 11;
493  char buffer[kBufferSize];
494  int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
495  DCHECK_LE(0, chars);
496  m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
497 }
498 
499 void String16Builder::appendNumber(size_t number) {
500  constexpr int kBufferSize = 20;
501  char buffer[kBufferSize];
502 #if !defined(_WIN32) && !defined(_WIN64)
503  int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
504 #else
505  int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
506 #endif
507  DCHECK_LE(0, chars);
508  m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
509 }
510 
511 void String16Builder::appendUnsignedAsHex(uint64_t number) {
512  constexpr int kBufferSize = 17;
513  char buffer[kBufferSize];
514  int chars =
515  v8::base::OS::SNPrintF(buffer, kBufferSize, "%016" PRIx64, number);
516  DCHECK_LE(0, chars);
517  m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
518 }
519 
520 void String16Builder::appendUnsignedAsHex(uint32_t number) {
521  constexpr int kBufferSize = 9;
522  char buffer[kBufferSize];
523  int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%08" PRIx32, number);
524  DCHECK_LE(0, chars);
525  m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
526 }
527 
// Returns a String16 holding a copy of everything appended so far. The
// builder's buffer is not modified.
528 String16 String16Builder::toString() {
529  return String16(m_buffer.data(), m_buffer.size());
530 }
531 
// Forwards to the buffer's reserve() so that space for at least |capacity|
// code units is allocated up front.
532 void String16Builder::reserveCapacity(size_t capacity) {
533  m_buffer.reserve(capacity);
534 }
535 
536 String16 String16::fromUTF8(const char* stringStart, size_t length) {
537  if (!stringStart || !length) return String16();
538 
539  std::vector<UChar> buffer(length);
540  UChar* bufferStart = buffer.data();
541 
542  UChar* bufferCurrent = bufferStart;
543  const char* stringCurrent = stringStart;
544  if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
545  bufferCurrent + buffer.size(), nullptr,
546  true) != conversionOK)
547  return String16();
548 
549  size_t utf16Length = bufferCurrent - bufferStart;
550  return String16(bufferStart, utf16Length);
551 }
552 
553 std::string String16::utf8() const {
554  size_t length = this->length();
555 
556  if (!length) return std::string("");
557 
558  // Allocate a buffer big enough to hold all the characters
559  // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
560  // Optimization ideas, if we find this function is hot:
561  // * We could speculatively create a CStringBuffer to contain 'length'
562  // characters, and resize if necessary (i.e. if the buffer contains
563  // non-ascii characters). (Alternatively, scan the buffer first for
564  // ascii characters, so we know this will be sufficient).
565  // * We could allocate a CStringBuffer with an appropriate size to
566  // have a good chance of being able to write the string into the
567  // buffer without reallocing (say, 1.5 x length).
568  if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
569  std::vector<char> bufferVector(length * 3);
570  char* buffer = bufferVector.data();
571  const UChar* characters = m_impl.data();
572 
573  ConversionResult result =
574  convertUTF16ToUTF8(&characters, characters + length, &buffer,
575  buffer + bufferVector.size(), false);
576  DCHECK(
577  result !=
578  targetExhausted); // (length * 3) should be sufficient for any conversion
579 
580  // Only produced from strict conversion.
581  DCHECK(result != sourceIllegal);
582 
583  // Check for an unconverted high surrogate.
584  if (result == sourceExhausted) {
585  // This should be one unpaired high surrogate. Treat it the same
586  // was as an unpaired high surrogate would have been handled in
587  // the middle of a string with non-strict conversion - which is
588  // to say, simply encode it to UTF-8.
589  DCHECK((characters + 1) == (m_impl.data() + length));
590  DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
591  // There should be room left, since one UChar hasn't been
592  // converted.
593  DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
594  putUTF8Triple(buffer, *characters);
595  }
596 
597  return std::string(bufferVector.data(), buffer - bufferVector.data());
598 }
599 
600 } // namespace v8_inspector
STL namespace.