V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
unicode-decoder.h
1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_UNICODE_DECODER_H_
6 #define V8_UNICODE_DECODER_H_
7 
8 #include <sys/types.h>
9 #include <algorithm>
10 #include "src/globals.h"
11 #include "src/unicode.h"
12 #include "src/utils.h"
13 #include "src/vector.h"
14 
15 namespace unibrow {
16 
17 class Utf8Iterator {
18  public:
19  explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
20  : Utf8Iterator(stream, 0, false) {}
21  Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
22  bool trailing)
23  : stream_(stream),
24  cursor_(offset),
25  offset_(0),
26  char_(0),
27  trailing_(false) {
28  DCHECK_LE(offset, stream.length());
29  // Read the first char, setting offset_ to offset in the process.
30  ++*this;
31 
32  // This must be set after reading the first char, since the offset marks
33  // the start of the octet sequence that the trailing char is part of.
34  trailing_ = trailing;
35  if (trailing) {
36  DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
37  }
38  }
39 
40  uint16_t operator*();
41  Utf8Iterator& operator++();
42  Utf8Iterator operator++(int);
43  bool Done();
44  bool Trailing() { return trailing_; }
45  size_t Offset() { return offset_; }
46 
47  private:
48  const v8::internal::Vector<const char>& stream_;
49  size_t cursor_;
50  size_t offset_;
51  uint32_t char_;
52  bool trailing_;
53 };
54 
55 class V8_EXPORT_PRIVATE Utf8DecoderBase {
56  public:
57  // Initialization done in subclass.
58  inline Utf8DecoderBase();
59  inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
60  const v8::internal::Vector<const char>& stream);
61  inline size_t Utf16Length() const { return utf16_length_; }
62 
63  protected:
64  // This reads all characters and sets the utf16_length_.
65  // The first buffer_length utf16 chars are cached in the buffer.
66  void Reset(uint16_t* buffer, size_t buffer_length,
67  const v8::internal::Vector<const char>& vector);
68  static void WriteUtf16Slow(uint16_t* data, size_t length,
70  size_t offset, bool trailing);
71 
72  size_t bytes_read_;
73  size_t chars_written_;
74  size_t utf16_length_;
75  bool trailing_;
76 
77  private:
78  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
79 };
80 
81 template <size_t kBufferSize>
82 class Utf8Decoder : public Utf8DecoderBase {
83  public:
84  inline Utf8Decoder() = default;
85  explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
86  inline void Reset(const v8::internal::Vector<const char>& stream);
87  inline size_t WriteUtf16(
88  uint16_t* data, size_t length,
89  const v8::internal::Vector<const char>& stream) const;
90 
91  private:
92  uint16_t buffer_[kBufferSize];
93 };
94 
95 Utf8DecoderBase::Utf8DecoderBase()
96  : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
97 
98 Utf8DecoderBase::Utf8DecoderBase(
99  uint16_t* buffer, size_t buffer_length,
100  const v8::internal::Vector<const char>& stream) {
101  Reset(buffer, buffer_length, stream);
102 }
103 
104 template <size_t kBufferSize>
105 Utf8Decoder<kBufferSize>::Utf8Decoder(
106  const v8::internal::Vector<const char>& stream)
107  : Utf8DecoderBase(buffer_, kBufferSize, stream) {}
108 
109 template <size_t kBufferSize>
110 void Utf8Decoder<kBufferSize>::Reset(
111  const v8::internal::Vector<const char>& stream) {
112  Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
113 }
114 
115 template <size_t kBufferSize>
116 size_t Utf8Decoder<kBufferSize>::WriteUtf16(
117  uint16_t* data, size_t data_length,
118  const v8::internal::Vector<const char>& stream) const {
119  DCHECK_GT(data_length, 0);
120  data_length = std::min(data_length, utf16_length_);
121 
122  // memcpy everything in buffer.
123  size_t memcpy_length = std::min(data_length, chars_written_);
124  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
125 
126  if (data_length <= chars_written_) return data_length;
127 
128  // Copy the rest the slow way.
129  WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
130  bytes_read_, trailing_);
131  return data_length;
132 }
133 
// Helpers for the Latin-1 range (code units 0x00..0xff).
class Latin1 {
 public:
  // Largest code unit that is still Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Convert the character to Latin-1 case equivalent if possible.
  // Characters without such an equivalent are returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t);
};
140 
141 uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
142  switch (c) {
143  // This are equivalent characters in unicode.
144  case 0x39c:
145  case 0x3bc:
146  return 0xb5;
147  // This is an uppercase of a Latin-1 character
148  // outside of Latin-1.
149  case 0x178:
150  return 0xff;
151  }
152  return c;
153 }
154 
155 
156 } // namespace unibrow
157 
158 #endif // V8_UNICODE_DECODER_H_