https://github.com/akkartik/mu/blob/main/linux/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and code-point-utf8s.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu code-point-utf8s are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
  12 # Mu doesn't currently support combining code points, or graphemes made of
  13 # multiple code points. One day we will.
 14 # On Linux, we also don't currently support code points that translate into
 15 # multiple or wide code-point-utf8s. (In particular, Tab will never be supported.)
 16 
 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 19 fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
 20   var c/eax: int <- copy in
 21   var num-trailers/ecx: int <- copy 0
 22   var first/edx: int <- copy 0
 23   $to-utf8:compute-length: {
 24     # single byte: just return it
 25     compare c, 0x7f
 26     {
 27       break-if->
 28       var g/eax: code-point-utf8 <- copy c
 29       return g
 30     }
 31     # 2 bytes
 32     compare c, 0x7ff
 33     {
 34       break-if->
 35       num-trailers <- copy 1
 36       first <- copy 0xc0
 37       break $to-utf8:compute-length
 38     }
 39     # 3 bytes
 40     compare c, 0xffff
 41     {
 42       break-if->
 43       num-trailers <- copy 2
 44       first <- copy 0xe0
 45       break $to-utf8:compute-length
 46     }
 47     # 4 bytes
 48     compare c, 0x1fffff
 49     {
 50       break-if->
 51       num-trailers <- copy 3
 52       first <- copy 0xf0
 53       break $to-utf8:compute-length
 54     }
 55     # more than 4 bytes: unsupported
 56     # TODO: print to stderr
 57     compare c, 0x1fffff
 58     {
 59       break-if->
 60       print-string-to-real-screen "unsupported code point "
 61       print-int32-hex-to-real-screen c
 62       print-string-to-real-screen "\n"
 63       var exit-status/ebx: int <- copy 1
 64       syscall_exit
 65     }
 66   }
 67   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 68   var result/edi: code-point-utf8 <- copy 0
 69   {
 70     compare num-trailers, 0
 71     break-if-<=
 72     var tmp/esi: int <- copy c
 73     tmp <- and 0x3f
 74     tmp <- or 0x80
 75     result <- shift-left 8
 76     result <- or tmp
 77     # update loop state
 78     c <- shift-right 6
 79     num-trailers <- decrement
 80     loop
 81   }
 82   # emit engine
 83   result <- shift-left 8
 84   result <- or c
 85   result <- or first
 86   #
 87   return result
 88 }
 89 
 90 # single-byte code point have identical code-point-utf8s
 91 fn test-to-utf8-single-byte {
 92   var in-int/ecx: int <- copy 0
 93   {
 94     compare in-int, 0x7f
 95     break-if->
 96     var in/eax: code-point <- copy in-int
 97     var out/eax: code-point-utf8 <- to-utf8 in
 98     var out-int/eax: int <- copy out
 99     check-ints-equal out-int, in-int, "F - test-to-utf8-single-byte"
100     in-int <- increment
101     loop
102   }
103 }
104 
105                                                               # byte       | byte      | byte      | byte
106 # smallest 2-byte utf-8
107 fn test-to-utf8-two-bytes-min {
108   var in/eax: code-point <- copy 0x80                         #                                 10     00-0000
109   var out/eax: code-point-utf8 <- to-utf8 in
110   var out-int/eax: int <- copy out
111   check-ints-equal out-int, 0x80c2, "F - to-utf8/2a"      #                         110 0-0010  10 00-0000
112 }
113 
114 # largest 2-byte utf-8
115 fn test-to-utf8-two-bytes-max {
116   var in/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
117   var out/eax: code-point-utf8 <- to-utf8 in
118   var out-int/eax: int <- copy out
119   check-ints-equal out-int, 0xbfdf, "F - to-utf8/2b"      #                         110 1-1111  10 11-1111
120 }
121 
122 # smallest 3-byte utf-8
123 fn test-to-utf8-three-bytes-min {
124   var in/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
125   var out/eax: code-point-utf8 <- to-utf8 in
126   var out-int/eax: int <- copy out
127   check-ints-equal out-int, 0x80a0e0, "F - to-utf8/3a"    #              1110 0000  10 10-0000  10 00-0000
128 }
129 
130 # largest 3-byte utf-8
131 fn test-to-utf8-three-bytes-max {
132   var in/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
133   var out/eax: code-point-utf8 <- to-utf8 in
134   var out-int/eax: int <- copy out
135   check-ints-equal out-int, 0xbfbfef, "F - to-utf8/3b"    #              1110 1111  10 11-1111  10 11-1111
136 }
137 
138 # smallest 4-byte utf-8
139 fn test-to-utf8-four-bytes-min {
140   var in/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
141   var out/eax: code-point-utf8 <- to-utf8 in
142   var out-int/eax: int <- copy out
143   check-ints-equal out-int, 0x808090f0, "F - to-utf8/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
144 }
145 
146 # largest 4-byte utf-8
147 fn test-to-utf8-four-bytes-max {
148   var in/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
149   var out/eax: code-point-utf8 <- to-utf8 in
150   var out-int/eax: int <- copy out
151   check-ints-equal out-int, 0xbfbfbff7, "F - to-utf8/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
152 }
153 
154 # read the next code-point-utf8 from a stream of bytes
155 fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
156   # if at eof, return EOF
157   {
158     var eof?/eax: boolean <- stream-empty? in
159     compare eof?, 0/false
160     break-if-=
161     return 0xffffffff
162   }
163   var c/eax: byte <- read-byte in
164   var num-trailers/ecx: int <- copy 0
165   $read-code-point-utf8:compute-length: {
166     # single byte: just return it
167     compare c, 0xc0
168     {
169       break-if->=
170       var g/eax: code-point-utf8 <- copy c
171       return g
172     }
173     compare c, 0xfe
174     {
175       break-if-<
176       var g/eax: code-point-utf8 <- copy c
177       return g
178     }
179     # 2 bytes
180     compare c, 0xe0
181     {
182       break-if->=
183       num-trailers <- copy 1
184       break $read-code-point-utf8:compute-length
185     }
186     # 3 bytes
187     compare c, 0xf0
188     {
189       break-if->=
190       num-trailers <- copy 2
191       break $read-code-point-utf8:compute-length
192     }
193     # 4 bytes
194     compare c, 0xf8
195     {
196       break-if->=
197       num-trailers <- copy 3
198       break $read-code-point-utf8:compute-length
199     }
200 $read-code-point-utf8:abort: {
201       # TODO: print to stderr
202       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not yet supported. First byte seen: "
203       var n/eax: int <- copy c
204       print-int32-hex-to-real-screen n
205       print-string-to-real-screen "\n"
206       var exit-status/ebx: int <- copy 1
207       syscall_exit
208     }
209   }
210   # prepend trailer bytes
211   var result/edi: code-point-utf8 <- copy c
212   var num-byte-shifts/edx: int <- copy 1
213   {
214     compare num-trailers, 0
215     break-if-<=
216     var tmp/eax: byte <- read-byte in
217     var tmp2/eax: int <- copy tmp
218     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
219     result <- or tmp2
220     # update loop state
221     num-byte-shifts <- increment
222     num-trailers <- decrement
223     loop
224   }
225   return result
226 }
227 
228 fn test-read-code-point-utf8 {
229   var s: (stream byte 0x30)
230   var s2/ecx: (addr stream byte) <- address s
231   write s2, "aΒc世d界e"
232   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
233   var n/eax: int <- copy c
234   check-ints-equal n, 0x61, "F - test code-point-utf8/0"
235   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
236   var n/eax: int <- copy c
237   check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
238   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
239   var n/eax: int <- copy c
240   check-ints-equal n, 0x63, "F - test code-point-utf8/2"
241   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
242   var n/eax: int <- copy c
243   check-ints-equal n, 0x96b8e4, "F - test code-point-utf8/3"
244   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
245   var n/eax: int <- copy c
246   check-ints-equal n, 0x64, "F - test code-point-utf8/4"
247   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
248   var n/eax: int <- copy c
249   check-ints-equal n, 0x8c95e7, "F - test code-point-utf8/5"
250   var c/eax: code-point-utf8 <- read-code-point-utf8 s2
251   var n/eax: int <- copy c
252   check-ints-equal n, 0x65, "F - test code-point-utf8/6"
253 }
254 
255 fn read-code-point-utf8-buffered in: (addr buffered-file) -> _/eax: code-point-utf8 {
256   var c/eax: byte <- read-byte-buffered in
257   var num-trailers/ecx: int <- copy 0
258   $read-code-point-utf8-buffered:compute-length: {
259     # single byte: just return it
260     compare c, 0xc0
261     {
262       break-if->=
263       var g/eax: code-point-utf8 <- copy c
264       return g
265     }
266     compare c, 0xfe
267     {
268       break-if-<
269       var g/eax: code-point-utf8 <- copy c
270       return g
271     }
272     # 2 bytes
273     compare c, 0xe0
274     {
275       break-if->=
276       num-trailers <- copy 1
277       break $read-code-point-utf8-buffered:compute-length
278     }
279     # 3 bytes
280     compare c, 0xf0
281     {
282       break-if->=
283       num-trailers <- copy 2
284       break $read-code-point-utf8-buffered:compute-length
285     }
286     # 4 bytes
287     compare c, 0xf8
288     {
289       break-if->=
290       num-trailers <- copy 3
291       break $read-code-point-utf8-buffered:compute-length
292     }
293 $read-code-point-utf8-buffered:abort: {
294       # TODO: print to stderr
295       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
296       var n/eax: int <- copy c
297       print-int32-hex-to-real-screen n
298       print-string-to-real-screen "\n"
299       var exit-status/ebx: int <- copy 1
300       syscall_exit
301     }
302   }
303   # prepend trailer bytes
304   var result/edi: code-point-utf8 <- copy c
305   var num-byte-shifts/edx: int <- copy 1
306   {
307     compare num-trailers, 0
308     break-if-<=
309     var tmp/eax: byte <- read-byte-buffered in
310     var tmp2/eax: int <- copy tmp
311     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
312     result <- or tmp2
313     # update loop state
314     num-byte-shifts <- increment
315     num-trailers <- decrement
316     loop
317   }
318   return result
319 }
320 
321 # needed because available primitives only shift by a literal/constant number of bits
322 fn shift-left-bytes n: int, k: int -> _/eax: int {
323   var i/ecx: int <- copy 0
324   var result/eax: int <- copy n
325   {
326     compare i, k
327     break-if->=
328     compare i, 4  # only 4 bytes in 32 bits
329     break-if->=
330     result <- shift-left 8
331     i <- increment
332     loop
333   }
334   return result
335 }
336 
337 fn test-shift-left-bytes-0 {
338   var result/eax: int <- shift-left-bytes 1, 0
339   check-ints-equal result, 1, "F - shift-left-bytes 0"
340 }
341 
342 fn test-shift-left-bytes-1 {
343   var result/eax: int <- shift-left-bytes 1, 1
344   check-ints-equal result, 0x100, "F - shift-left-bytes 1"
345 }
346 
347 fn test-shift-left-bytes-2 {
348   var result/eax: int <- shift-left-bytes 1, 2
349   check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
350 }
351 
352 fn test-shift-left-bytes-3 {
353   var result/eax: int <- shift-left-bytes 1, 3
354   check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
355 }
356 
357 fn test-shift-left-bytes-4 {
358   var result/eax: int <- shift-left-bytes 1, 4
359   check-ints-equal result, 0, "F - shift-left-bytes 4"
360 }
361 
362 fn test-shift-left-bytes-5 {
363   var result/eax: int <- shift-left-bytes 1, 5
364   check-ints-equal result, 0, "F - shift-left-bytes >4"
365 }
366 
367 # write a code-point-utf8 to a stream of bytes
368 # this is like write-to-stream, except we skip leading 0 bytes
369 fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
370 $write-code-point-utf8:body: {
371   var c/eax: int <- copy g
372   append-byte out, c  # first byte is always written
373   c <- shift-right 8
374   compare c, 0
375   break-if-= $write-code-point-utf8:body
376   append-byte out, c
377   c <- shift-right 8
378   compare c, 0
379   break-if-= $write-code-point-utf8:body
380   append-byte out, c
381   c <- shift-right 8
382   compare c, 0
383   break-if-= $write-code-point-utf8:body
384   append-byte out, c
385 }
386 }