Mu - 014literal

https://github.com/akkartik/mu1/blob/master/014literal_string.cc
  1 //: For convenience, some instructions will take literal arrays of characters
  2 //: (text or strings).
  3 //:
  4 //: Instead of quotes, we'll use [] to delimit strings. That'll reduce the
  5 //: need for escaping since we can support nested brackets. And we can also
  6 //: imagine that 'recipe' might one day itself be defined in Mu, doing its own
  7 //: parsing.
  8 
  9 :(scenarios load)
 10 :(scenario string_literal)
 11 def main [
 12   1:address:array:character <- copy [abc def]
 13 ]
 14 +parse:   ingredient: {"abc def": "literal-string"}
 15 
 16 :(scenario string_literal_with_colons)
 17 def main [
 18   1:address:array:character <- copy [abc:def/ghi]
 19 ]
 20 +parse:   ingredient: {"abc:def/ghi": "literal-string"}
 21 
 22 :(before "End Mu Types Initialization")
 23 put(Type_ordinal, "literal-string", 0);  // map the type name "literal-string" to 0; presumably 0 is shared by literal types -- TODO confirm
 24 
 25 :(before "End next_word Special-cases")
 26 if (in.peek() == '[') {
 27   string result = slurp_quoted(in);
 28   skip_whitespace_and_comments_but_not_newline(in);
 29   return result;
 30 }
 31 
 32 :(code)
 33 string slurp_quoted(istream& in) {
 34   ostringstream out;
 35   assert(has_data(in));  assert(in.peek() == '[');  out << static_cast<char>(in.get());  // slurp the '['
 36   if (is_code_string(in, out))
 37     slurp_quoted_comment_aware(in, out);
 38   else
 39     slurp_quoted_comment_oblivious(in, out);
 40   return out.str();
 41 }
 42 
 43 // A string is a code string (ignores comments when scanning for matching
 44 // brackets) if it contains a newline at the start before any non-whitespace.
 45 bool is_code_string(istream& in, ostream& out) {
 46   while (has_data(in)) {
 47     char c = in.get();
 48     if (!isspace(c)) {
 49       in.putback(c);
 50       return false;
 51     }
 52     out << c;
 53     if (c == '\n') {
 54       return true;
 55     }
 56   }
 57   return false;
 58 }
 59 
 60 // Read a regular string. Regular strings can only contain other regular
 61 // strings.
 62 void slurp_quoted_comment_oblivious(istream& in, ostream& out) {
 63   int brace_depth = 1;
 64   while (has_data(in)) {
 65     char c = in.get();
 66     if (c == '\\') {
 67       slurp_one_past_backslashes(in, out);
 68       continue;
 69     }
 70     out << c;
 71     if (c == '[') ++brace_depth;
 72     if (c == ']') --brace_depth;
 73     if (brace_depth == 0) break;
 74   }
 75   if (!has_data(in) && brace_depth > 0) {
 76     raise << "unbalanced '['\n" << end();
 77     out.clear();
 78   }
 79 }
 80 
 81 // Read a code string. Code strings can contain either code or regular strings.
 82 void slurp_quoted_comment_aware(istream& in, ostream& out) {
 83   char c;
 84   while (in >> c) {
 85     if (c == '\\') {
 86       slurp_one_past_backslashes(in, out);
 87       continue;
 88     }
 89     if (c == '#') {
 90       out << c;
 91       while (has_data(in) && in.peek() != '\n') out << static_cast<char>(in.get());
 92       continue;
 93     }
 94     if (c == '[') {
 95       in.putback(c);
 96       // recurse
 97       out << slurp_quoted(in);
 98       continue;
 99     }
100     out << c;
101     if (c == ']') return;
102   }
103   raise << "unbalanced '['\n" << end();
104   out.clear();
105 }
106 
107 :(after "Parsing reagent(string s)")
108 if (starts_with(s, "[")) {
109   if (*s.rbegin() != ']') return;  // unbalanced bracket; handled elsewhere
110   name = s;
111   // delete [] delimiters
112   name.erase(0, 1);
113   strip_last(name);
114   type = new type_tree("literal-string", 0);
115   return;
116 }
117 
118 //: Unlike other reagents, escape newlines in literal strings to make them
119 //: more friendly to trace().
120 
121 :(after "string to_string(const reagent& r)")
122   if (is_literal_text(r))
123     return emit_literal_string(r.name);
124 
125 :(code)
126 bool is_literal_text(const reagent& x) {
127   return x.type && x.type->name == "literal-string";
128 }
129 
130 string emit_literal_string(string name) {
131   size_t pos = 0;
132   while (pos != string::npos)
133     pos = replace(name, "\n", "\\n", pos);
134   return "{\""+name+"\": \"literal-string\"}";
135 }
136 
// Replace the first occurrence of 'from' in 'str' at or after index 'n' with
// 'to'. Returns the index where the occurrence began, or string::npos (leaving
// 'str' untouched) if there is none.
size_t replace(string& str, const string& from, const string& to, size_t n) {
  const size_t hit = str.find(from, n);
  if (hit == string::npos) return string::npos;
  str.replace(hit, from.length(), to);
  return hit;
}
143 
144 void strip_last(string& s) {
145   if (!s.empty()) s.erase(SIZE(s)-1);
146 }
147 
148 void slurp_one_past_backslashes(istream& in, ostream& out) {
149   // When you encounter a backslash, strip it out and pass through any
150   // following run of backslashes. If we 'escaped' a single following
151   // character, then the character '\' would be:
152   //   '\\' escaped once
153   //   '\\\\' escaped twice
154   //   '\\\\\\\\' escaped thrice (8 backslashes)
155   // ..and so on. With our approach it'll be:
156   //   '\\' escaped once
157   //   '\\\' escaped twice
158   //   '\\\\' escaped thrice
159   // This only works as long as backslashes aren't also overloaded to create
160   // special characters. So Mu doesn't follow C's approach of overloading
161   // backslashes both to escape quote characters and also as a notation for
162   // unprintable characters like '\n'.
163   while (has_data(in)) {
164     char c = in.get();
165     out << c;
166     if (c != '\\') break;
167   }
168 }
169 
170 :(scenario string_literal_nested)
171 def main [
172   1:address:array:character <- copy [abc [def]]
173 ]
174 +parse:   ingredient: {"abc [def]": "literal-string"}
175 
176 :(scenario string_literal_escaped)
177 def main [
178   1:address:array:character <- copy [abc \[def]
179 ]
180 +parse:   ingredient: {"abc [def": "literal-string"}
181 
182 :(scenario string_literal_escaped_twice)
183 def main [
184   1:address:array:character <- copy [
185 abc \\[def]
186 ]
187 +parse:   ingredient: {"\nabc \[def": "literal-string"}
188 
189 :(scenario string_literal_and_comment)
190 def main [
191   1:address:array:character <- copy [abc]  # comment
192 ]
193 +parse: --- defining main
194 +parse: instruction: copy
195 +parse:   number of ingredients: 1
196 +parse:   ingredient: {"abc": "literal-string"}
197 +parse:   product: {1: ("address" "array" "character")}
198 
199 :(scenario string_literal_escapes_newlines_in_trace)
200 def main [
201   copy [abc
202 def]
203 ]
204 +parse:   ingredient: {"abc\ndef": "literal-string"}
205 
206 :(scenario string_literal_can_skip_past_comments)
207 def main [
208   copy [
209     # ']' inside comment
210     bar
211   ]
212 ]
213 +parse:   ingredient: {"\n    # ']' inside comment\n    bar\n  ": "literal-string"}
214 
215 :(scenario string_literal_empty)
216 def main [
217   copy []
218 ]
219 +parse:   ingredient: {"": "literal-string"}
220 
221 :(scenario multiple_unfinished_recipes)
222 % Hide_errors = true;
223 def f1 [
224 def f2 [
225 +error: unbalanced '['