https://github.com/akkartik/mu/blob/main/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # The basic unit for rendering Unicode is the code point.
  4 #   https://en.wikipedia.org/wiki/Code_point
  5 # The glyph a non-cursive font displays may represent multiple code points.
  6 #
  7 # In addition to raw code points (just integers assigned special meaning), Mu
  8 # provides a common encoding as a convenience: code-point-utf8.
  9 
 10 fn test-unicode-serialization-and-deserialization {
 11   var i/ebx: int <- copy 0
 12   var init?/esi: boolean <- copy 1/true
 13   {
 14     compare i, 0x10000  # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
 15                         # but not emoji
 16     break-if->=
 17     var c/eax: code-point <- copy i
 18     var _g/eax: code-point-utf8 <- to-utf8 c
 19     var g/ecx: code-point-utf8 <- copy _g
 20     var c2/eax: code-point <- to-code-point g
 21     compare i, c2
 22     {
 23       break-if-=
 24       {
 25         compare init?, 0/false
 26         break-if-=
 27         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
 28       }
 29       init? <- copy 0/false
 30       draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg
 31       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
 32       {
 33         var x/eax: int <- copy g
 34         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg
 35       }
 36       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
 37       {
 38         var x2/eax: int <- copy c2
 39         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg
 40       }
 41       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
 42     }
 43     i <- add 0xf  # to speed things up; ensure increment is not a power of 2
 44     loop
 45   }
 46 }
 47 
 48 # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
 49 fn to-code-point in: code-point-utf8 -> _/eax: code-point {
 50   var g/ebx: int <- copy in
 51   # if single byte, just return it
 52   {
 53     compare g, 0xff
 54     break-if->
 55     var result/eax: code-point <- copy g
 56     return result
 57   }
 58   #
 59   var len/edx: int <- utf8-length in
 60   # extract bits from first byte
 61   var b/eax: byte <- copy-byte g
 62   var result/edi: code-point <- copy b
 63   {
 64     compare len, 2
 65     break-if-!=
 66     result <- and 0x1f
 67   }
 68   {
 69     compare len, 3
 70     break-if-!=
 71     result <- and 0x0f
 72   }
 73   {
 74     compare len, 4
 75     break-if-!=
 76     result <- and 0x07
 77   }
 78   # extract bits from remaining bytes
 79   g <- shift-right 8
 80   var i/ecx: int <- copy 1
 81   {
 82     compare i, len
 83     break-if->=
 84     var b/eax: byte <- copy-byte g
 85     b <- and 0x3f
 86     result <- shift-left 6
 87     result <- or b
 88     g <- shift-right 8
 89     i <- increment
 90     loop
 91   }
 92   return result
 93 }
 94 
 95 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 96 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 97 fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
 98   var c/eax: int <- copy in
 99   var num-trailers/ecx: int <- copy 0
100   var first/edx: int <- copy 0
101   $to-utf8:compute-length: {
102     # single byte: just return it
103     compare c, 0x7f
104     {
105       break-if->
106       var g/eax: code-point-utf8 <- copy c
107       return g
108     }
109     # 2 bytes
110     compare c, 0x7ff
111     {
112       break-if->
113       num-trailers <- copy 1
114       first <- copy 0xc0
115       break $to-utf8:compute-length
116     }
117     # 3 bytes
118     compare c, 0xffff
119     {
120       break-if->
121       num-trailers <- copy 2
122       first <- copy 0xe0
123       break $to-utf8:compute-length
124     }
125     # 4 bytes
126     compare c, 0x1fffff
127     {
128       break-if->
129       num-trailers <- copy 3
130       first <- copy 0xf0
131       break $to-utf8:compute-length
132     }
133     # more than 4 bytes: unsupported
134     compare c, 0x1fffff
135     {
136       break-if->
137       abort "unsupported code point"
138       return 0
139     }
140   }
141   # emit trailer bytes, 6 bits from 'in', first two bits '10'
142   var result/edi: code-point-utf8 <- copy 0
143   {
144     compare num-trailers, 0
145     break-if-<=
146     var tmp/esi: int <- copy c
147     tmp <- and 0x3f
148     tmp <- or 0x80
149     result <- shift-left 8
150     result <- or tmp
151     # update loop state
152     c <- shift-right 6
153     num-trailers <- decrement
154     loop
155   }
156   # emit engine
157   result <- shift-left 8
158   result <- or c
159   result <- or first
160   #
161   return result
162 }
163 
# single-byte code points have identical code-point-utf8s
# (every value from 0 through 0x7f is checked)
fn test-to-utf8-single-byte {
  var expected/ecx: int <- copy 0
  {
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var encoded/eax: code-point-utf8 <- to-utf8 cp
    var actual/eax: int <- copy encoded
    check-ints-equal actual, expected, "F - test-to-utf8-single-byte"
    expected <- increment
    loop
  }
}
178 
179                                                               # byte       | byte      | byte      | byte
# smallest 2-byte utf-8
fn test-to-utf8-two-bytes-min {
  # code point bits:                       10  00-0000
  var cp/eax: code-point <- copy 0x80
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   110 0-0010  10 00-0000   (0xc2 0x80)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0x80c2, "F - to-utf8/2a"
}
187 
# largest 2-byte utf-8
fn test-to-utf8-two-bytes-max {
  # code point bits:                   1-1111  11-1111
  var cp/eax: code-point <- copy 0x7ff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   110 1-1111  10 11-1111   (0xdf 0xbf)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0xbfdf, "F - to-utf8/2b"
}
195 
# smallest 3-byte utf-8
fn test-to-utf8-three-bytes-min {
  # code point bits:                             10-0000  00-0000
  var cp/eax: code-point <- copy 0x800
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   1110 0000  10 10-0000  10 00-0000   (0xe0 0xa0 0x80)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0x80a0e0, "F - to-utf8/3a"
}
203 
# largest 3-byte utf-8
fn test-to-utf8-three-bytes-max {
  # code point bits:                    1111     11-1111  11-1111
  var cp/eax: code-point <- copy 0xffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   1110 1111  10 11-1111  10 11-1111   (0xef 0xbf 0xbf)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0xbfbfef, "F - to-utf8/3b"
}
211 
# smallest 4-byte utf-8
fn test-to-utf8-four-bytes-min {
  # code point bits:                            1-0000     00-0000     00-0000
  var cp/eax: code-point <- copy 0x10000
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   1111-0 000  10 01-0000  10 00-0000  10 00-0000   (0xf0 0x90 0x80 0x80)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0x808090f0, "F - to-utf8/4a"
}
219 
# largest 4-byte utf-8
fn test-to-utf8-four-bytes-max {
  # code point bits:                      111     11-1111     11-1111     11-1111
  var cp/eax: code-point <- copy 0x1fffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes in stream order:   1111-0 111  10 11-1111  10 11-1111  10 11-1111   (0xf7 0xbf 0xbf 0xbf)
  # the first byte occupies the lowest-order byte of the word
  check-ints-equal actual, 0xbfbfbff7, "F - to-utf8/4b"
}
227 
# read the next code-point-utf8 from a stream of bytes
# Returns 0xffffffff if the stream is already empty.
# The first byte read decides how many trailer bytes to read after it; each
# trailer is packed into the next higher-order byte of the result.
fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
  # nothing left to read? signal EOF
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  $read-code-point-utf8:compute-length: {
    # below 0xc0: the byte is returned by itself
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xfe and up: likewise returned by itself
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # below 0xe0: one trailer follows
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-code-point-utf8:compute-length
    }
    # below 0xf0: two trailers follow
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-code-point-utf8:compute-length
    }
    # below 0xf8: three trailers follow
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-code-point-utf8:compute-length
    }
    # anything else (0xf8 through 0xfd) announces a longer sequence
    abort "utf-8 encodings larger than 4 bytes are not yet supported"
    return 0
  }
  # start from the first byte, then slot each trailer in above it
  var packed/edi: code-point-utf8 <- copy lead
  var byte-pos/edx: int <- copy 1  # which byte of the word the next trailer goes into
  {
    compare trailers-left, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, byte-pos
    packed <- or shifted
    # advance to the next trailer
    byte-pos <- increment
    trailers-left <- decrement
    loop
  }
  return packed
}
294 
# Read a string mixing 1-, 2- and 3-byte encodings back out of a stream, one
# code-point-utf8 at a time, and check each packed value.
fn test-read-code-point-utf8 {
  var storage: (stream byte 0x30)
  var stream/ecx: (addr stream byte) <- address storage
  write stream, "aΒc世d界e"
  # 1 byte
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x61, "F - test code-point-utf8/0"
  # 2 bytes
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
  # 1 byte
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x63, "F - test code-point-utf8/2"
  # 3 bytes
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x96b8e4, "F - test code-point-utf8/3"
  # 1 byte
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x64, "F - test code-point-utf8/4"
  # 3 bytes
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x8c95e7, "F - test code-point-utf8/5"
  # 1 byte
  var g/eax: code-point-utf8 <- read-code-point-utf8 stream
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x65, "F - test code-point-utf8/6"
}
321 
# Number of bytes (1 to 4) a code-point-utf8 occupies, judged by the
# highest-order non-zero byte of the word.
#
# All comparisons are unsigned. A 4-byte encoding keeps its last trailer byte
# (0x80..0xbf) in the top byte of the word, so under a signed comparison it
# would look negative and be reported as a single byte.
fn utf8-length g: code-point-utf8 -> _/edx: int {
  {
    compare g, 0xff
    break-if-addr>
    return 1
  }
  {
    compare g, 0xffff
    break-if-addr>
    return 2
  }
  {
    compare g, 0xffffff
    break-if-addr>
    return 3
  }
  return 4
}
340 
# needed because available primitives only shift by a literal/constant number of bits
# Shifts n left by k whole bytes, one byte at a time.
# At most 4 shifts are performed; a k of 0 or less returns n unchanged.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var shifted/eax: int <- copy n
  var count/ecx: int <- copy 0
  {
    # cap at 4: that many byte shifts already push every bit out of 32 bits
    compare count, 4
    break-if->=
    # done once k shifts have been applied
    compare count, k
    break-if->=
    shifted <- shift-left 8
    count <- increment
    loop
  }
  return shifted
}
356 
# shifting by zero bytes leaves the value alone
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
361 
# one byte = 8 bits
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
366 
# two bytes = 16 bits
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
371 
# three bytes = 24 bits; the value lands in the top byte
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
376 
# four bytes pushes every bit out of a 32-bit word
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
381 
# asking for more than four bytes behaves like four: the result is still 0
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
386 
# write a code-point-utf8 to a stream of bytes
# Bytes go out lowest-order first. The lowest byte is always written; after
# that we stop as soon as everything still left in the word is zero, so unused
# high-order bytes never reach the stream.
fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
  var rest/eax: int <- copy g
  append-byte out, rest  # first byte is always written
  {
    # drop the byte just written; shift-right fills with zeros, so 'rest'
    # is guaranteed to hit 0 after at most 3 more bytes
    rest <- shift-right 8
    compare rest, 0
    break-if-=
    append-byte out, rest
    loop
  }
}
406 }