iopipe.json.parser source code

1 /**
2 Copyright: Copyright Steven Schveighoffer 2017
3 License:   Boost License 1.0. (See accompanying file LICENSE_1_0.txt or copy at
4            http://www.boost.org/LICENSE_1_0.txt)
5 Authors: Steven Schveighoffer
6 */
7 module iopipe.json.parser;
8 import iopipe.traits;
9 import iopipe.bufpipe;
10 import std.range.primitives;
11 import std.traits;
12 
/**
 * Tokens as parsed from the stream. This indicates what the next token is
 * supposed to be, and doesn't necessarily validate the next item is in the
 * correct format.
 *
 * Most members use the token's leading character as their value. `Number`
 * uses '0' as a representative value, while `EOF` and `Error` use values
 * that do not start any JSON token.
 */
enum JSONToken : char
{
    ObjectStart = '{', /// {
    ObjectEnd = '}',   /// }
    String = '"',      /// "
    Colon = ':',       /// :
    Comma = ',',       /// ,
    ArrayStart = '[',  /// [
    ArrayEnd = ']',    /// ]
    Number = '0',      /// - or 0-9
    True = 't',        /// t
    False = 'f',       /// f
    Null = 'n',        /// n
    EOF = '\0',         /// end of stream
    Error = 0xff,       /// unexpected data in stream
}
34 
/**
 * Hint on how to parse this value. If the item is a Number or String, then
 * this describes what was found while validating it: for a number, whether it
 * is integral, has a decimal place, or has an exponent; for a string, whether
 * it still contains escape sequences. The members are plain sequential
 * values, not bit flags.
 */
enum JSONParseHint : ubyte
{
    InPlace, /// Item is not a value, or is a string that can be used in place.
    Int,     /// Item is integral (no decimal or exponent).
    Float,   /// number has decimal place, but no exponent
    Exp,     /// number has exponent (and is float).
    Escapes, /// string has escapes
}
48 
/**
 * Tests whether a token may stand where JSON expects a "value": the start of
 * an object or array, a string, a number, or one of the literals `true`,
 * `false` and `null`. Useful for validation.
 *
 * Returns: `true` for value tokens, `false` for every other token.
 */
bool isValue(JSONToken token) pure @safe nothrow
{
    with(JSONToken)
    {
        return token == ObjectStart
            || token == ArrayStart
            || token == String
            || token == Number
            || token == True
            || token == False
            || token == Null;
    }
}
69 
/**
 * Locate the next token in the iopipe c, skipping any white space in front of
 * it. Nothing is validated here: every JSON token is identified by its first
 * character, so only that character is examined.
 *
 * Params:
 *    c = The iopipe in which to search for tokens. extend may be called on
 *    the iopipe when the current window runs out before a token is found.
 *    pos = Current position in the window. Taken via ref and advanced to the
 *    first character of the token that was found.
 * Returns: The expected type of token at the new position, `EOF` if the pipe
 * has no more data, or `Error` if the character cannot start a JSON token.
 */
JSONToken jsonTok(Chain)(ref Chain c, ref size_t pos) if (isIopipe!Chain && isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    import std.ascii: isWhite;

    // Skip white space; whenever the window is exhausted, ask for more data.
    for(;;)
    {
        auto win = c.window;
        while(pos < win.length && isWhite(win[pos]))
            ++pos;
        if(pos < win.length)
            break;
        if(c.extend(0) == 0)
            return JSONToken.EOF;
    }

    immutable first = c.window[pos];

    // numbers begin with a minus sign or any digit
    if(first == '-' || (first >= '0' && first <= '9'))
        return JSONToken.Number;

    switch(first) with(JSONToken)
    {
    case ObjectStart, ObjectEnd, ArrayStart, ArrayEnd, Colon, Comma, String,
         True, False, Null:
        // the enum value is the character itself
        return cast(JSONToken)first;
    default:
        return Error;
    }
}
118 
/**
 * One JSON item located inside an iopipe window. It behaves like a slice of
 * the window but stores only offsets, which keeps it cheap to copy and easy
 * to adjust. White space is never part of an item, and a string item excludes
 * its surrounding quotes.
 */
struct JSONItem
{
    /**
     * For a standard token, the index in the current iopipe window at which
     * the item starts. After releasing data from the front of the window,
     * subtract the number of released elements to keep this valid.
     */
    size_t offset;

    /**
     * Number of window elements the item occupies.
     */
    size_t length;

    /**
     * Which kind of token this item is.
     */
    JSONToken token;

    /**
     * Parsing hint describing the contents of the item. It is filled in for
     * Strings and Numbers during validation; other token types leave it
     * untouched.
     */
    JSONParseHint hint;

    /**
     * Slices the window of the iopipe this item was parsed from, yielding
     * exactly the elements that make up the item.
     */
    auto data(Chain)(ref Chain c)
    {
        immutable end = offset + length;
        return c.window[offset .. end];
    }
}
159 
/**
 * Parse and validate a string from an iopipe. This function serves several
 * purposes. First, it determines how long the string actually is, so it can be
 * properly parsed out. Second, it verifies any escapes inside the string.
 * Third, if requested, it can replace any escapes with the actual characters
 * in-place, so the string result can just be used. Note that this does not
 * verify any UTF code units are valid. However, any unicode escapes using the
 * `\uXXXX` sequence are validated, including that UTF-16 surrogates appear
 * only as a high surrogate escape immediately followed by a low surrogate
 * escape.
 *
 * Malformed input never throws: every error (including non-hex digits after
 * `\u`) is reported through the -1 return value.
 *
 * Params:
 *     replaceEscapes = If true, then any encountered escapes will be replaced
 *     with the actual utf characters they represent, properly encoded for the
 *     given element type.
 *     c = The iopipe to parse the string from. If the end of the string is not
 *     present in the current window, it will be extended until the end is
 *     found.
 *     pos = Upon calling, this should be the position in the stream window
 *     where the first quotation mark is for this string. Upon exit, if
 *     successfully parsed, pos is updated to the position just after the final
 *     quotation mark. If an error occurs, pos should be at the spot where the
 *     invalid sequence occurred.
 *     hint = Set to `InPlace` if the string no longer contains escapes (either
 *     there weren't any, or they have been replaced). Set to `Escapes` if the
 *     string still contains escapes that need to be properly parsed.
 * Returns: number of elements in the resulting string if successfully parsed,
 * or -1 if there is an error. Note that if escapes are not replaced, then this
 * number includes the escape character sequences as-is.
 */
int parseString(bool replaceEscapes = true, Chain)(ref Chain c, ref size_t pos, ref JSONParseHint hint)
{
    hint = JSONParseHint.InPlace;
    // the first character must be a quote. Checking pos against the length
    // (rather than only for an empty window) avoids indexing out of bounds.
    auto src = c.window;
    if(pos >= src.length || src[pos] != '"')
        return -1;
    ++pos;

    immutable origPos = pos;
    static if(replaceEscapes)
        auto targetPos = pos;
    bool isEscaped = false;
    // first half of a surrogate pair waiting for its partner. wchar.init
    // (0xFFFF) is not a surrogate, so it doubles as "nothing pending".
    wchar surrogate;
    while(true)
    {
        if(pos == src.length)
        {
            // need more data from the pipe
            if(c.extend(0) == 0)
                // EOF.
                return -1;
            src = c.window;
        }
        auto elem = src[pos];
        if(isEscaped)
        {
            isEscaped = false;
            if(elem == 'u') // unicode sequence.
            {
                // ensure there are at least 4 characters available.
                ++pos;
                if(pos + 4 > src.length)
                {
                    c.ensureElems(pos + 4);
                    // may need to re-assign src.
                    src = c.window;
                    if(pos + 4 > src.length)
                    {
                        // invalid sequence.
                        pos = src.length;
                        return -1;
                    }
                }

                // decode the four hex digits by hand. A library parser would
                // throw when the first digit is invalid; here every bad digit
                // becomes an ordinary error with pos left on the offender.
                wchar value = 0;
                foreach(i; 0 .. 4)
                {
                    immutable h = src[pos + i];
                    uint digit;
                    if(h >= '0' && h <= '9')
                        digit = h - '0';
                    else if(h >= 'a' && h <= 'f')
                        digit = h - 'a' + 10;
                    else if(h >= 'A' && h <= 'F')
                        digit = h - 'A' + 10;
                    else
                    {
                        // some characters not proper hex
                        pos += i;
                        return -1;
                    }
                    value = cast(wchar)((value << 4) | digit);
                }
                pos += 4;
                alias Char = typeof(src[0]);

                static if(replaceEscapes)
                {
                    // function to encode a dchar into the target stream.
                    void enc(dchar d)
                    {
                        // insert the given dchar into the stream
                        static if(is(Char == dchar))
                        {
                            src[targetPos++] = d;
                        }
                        else static if(is(Char == wchar))
                        {
                            // this only happens if we have a dchar cast
                            // from a non-surrogate wchar. So cheat and just
                            // copy it.
                            src[targetPos++] = cast(wchar)d;
                        }
                        else // char
                        {
                            // potentially need to encode it. Most of the
                            // time, anyone using the \u escape sequence is
                            // not going to be encoding ascii data. So
                            // don't worry about that shortcut.
                            import std.utf : encode;
                            char[4] data;
                            foreach(i; 0 .. encode(data, d))
                                src[targetPos++] = data[i];
                        }
                    }
                }

                // if we have a surrogate pair cached from the last
                // element parsed, then this must be the matching pair.
                if(surrogate != wchar.init)
                {
                    // need to parse out this into a dchar. First,
                    // determine that they match.
                    if(value < 0xdc00 || value > 0xdfff)
                        // invalid sequence
                        return -1;

                    static if(replaceEscapes)
                    {
                        // valid sequence, put it into the stream.
                        static if(is(Char == wchar))
                        {
                            // just copy the two surrogates to the stream
                            src[targetPos++] = surrogate;
                            src[targetPos++] = value;
                        }
                        else
                        {
                            // convert to dchar. Surrogate pairs encode code
                            // points starting at U+10000, so that offset must
                            // be added to the 20 bits carried by the pair.
                            dchar converted = cast(dchar)(0x10000
                                + ((surrogate & 0x3ff) << 10)
                                + (value & 0x3ff));
                            enc(converted);
                        }
                    }
                    // reset the surrogate pair
                    surrogate = wchar.init;
                }
                else
                {
                    if(value >= 0xd800 && value <= 0xdbff)
                    {
                        // this is the first half of a surrogate pair
                        surrogate = value;
                    }
                    else
                    {
                        if(value >= 0xdc00 && value <= 0xdfff)
                        {
                            // second surrogate pair, but we didn't get
                            // a first one. Error.
                            return -1;
                        }
                        // need to encode this into the stream
                        static if(replaceEscapes)
                            enc(value);
                    }
                }
            }
            else
            {
                if(surrogate != wchar.init)
                    // a pending high surrogate may only be followed by a
                    // \uXXXX low surrogate, not by any other escape.
                    return -1;

                static if(replaceEscapes)
                {
                    switch(elem)
                    {
                    case '\\':
                    case '/':
                    case '"':
                        src[targetPos++] = elem;
                        break;
                    case 'n':
                        src[targetPos++] = '\n';
                        break;
                    case 'b':
                        src[targetPos++] = '\b';
                        break;
                    case 'f':
                        src[targetPos++] = '\f';
                        break;
                    case 'r':
                        src[targetPos++] = '\r';
                        break;
                    case 't':
                        src[targetPos++] = '\t';
                        break;
                    default:
                        // unknown escape
                        return -1;
                    }
                }
                else
                {
                    // just make sure it's a valid escape character
                    switch(elem)
                    {
                    case '\\': case '/': case'"': case 'n':
                    case 'b': case 'f': case 'r': case 't':
                        break;
                    default:
                        return -1;
                    }
                }
                ++pos;
            }
        }
        else if(elem == '\\')
        {
            static if(!replaceEscapes)
                hint = JSONParseHint.Escapes;
            isEscaped = true;
            ++pos;
        }
        else if(surrogate != wchar.init)
        {
            // we were expecting another surrogate pair, error.
            return -1;
        }
        else if(elem == '"')
        {
            // finished
            ++pos;
            static if(replaceEscapes)
                return cast(int)(targetPos - origPos);
            else
                return cast(int)(pos - origPos - 1);
        }
        else
        {
            static if(replaceEscapes)
            {
                // simple copy
                if(targetPos != pos)
                    src[targetPos] = elem;
                ++targetPos;
            }
            ++pos;
        }
    }
}

unittest
{
    void testParse(bool replaceEscape, C)(C[] jsonString, bool shouldFail, JSONParseHint expectedHint = JSONParseHint.InPlace, int expectedResult = -1, const(C)[] expectedString = null)
    {
        size_t pos;
        JSONParseHint hint;
        if(expectedString == null)
            expectedString = jsonString[1 .. $-1].dup;
        auto result = parseString!replaceEscape(jsonString, pos, hint);
        if(shouldFail)
        {
            assert(result == -1, jsonString);
        }
        else
        {
            assert(result == (expectedResult < 0 ? jsonString.length - 2 : expectedResult), jsonString);
            assert(pos == jsonString.length, jsonString);
            assert(hint == expectedHint, jsonString);
            assert(jsonString[1 .. 1 + result] == expectedString, jsonString);
        }
    }

    testParse!false(q"{"abcdef"}", false);
    testParse!false(q"{"abcdef}", true);
    testParse!true(q"{"abcdef"}".dup, false);
    testParse!true(q"{"abcdef\n"}".dup, false, JSONParseHint.InPlace, 7, "abcdef\n");
    testParse!true(q"{"abcdef\ua123\n"}".dup, false, JSONParseHint.InPlace, 10, "abcdef\ua123\n");
    testParse!false(q"{"abcdef\ua123\n"}", false, JSONParseHint.Escapes);

    // a surrogate pair must decode to the astral code point (U+1F600)
    testParse!true(q"{"\ud83d\ude00"}".dup, false, JSONParseHint.InPlace, 4, "\U0001F600");
    testParse!false(q"{"\ud83d\ude00"}", false, JSONParseHint.Escapes);
    // non-hex digits are an error, not an exception
    testParse!true(q"{"\uzzzz"}".dup, true);
    testParse!false(q"{"\u12zz"}", true);
    // unpaired or interrupted surrogates are rejected
    testParse!false(q"{"\ude00"}", true);
    testParse!false(q"{"\ud83dx"}", true);
    testParse!false(q"{"\ud83d\n\ude00"}", true);
}
436 
/**
 * Validate a number in the given iopipe against the JSON number grammar and
 * measure how many stream elements it spans.
 *
 * Params:
 *     c = The iopipe the number is being parsed from.
 *     pos = On entry, the window position where the number should begin. On
 *     successful exit, the position just past the last element of the number.
 *     On failure, the position at which the grammar was violated.
 *     hint = On return, tells whether the number is integral, floating point,
 *     or floating point with an exponent, so the caller can pick a suitable
 *     conversion. No check is made that the value fits in, or is exactly
 *     representable by, any particular type.
 *
 * Returns: The number of elements in the iopipe that comprise this number, or
 * -1 if there was a parsing error.
 */
int parseNumber(Chain)(ref Chain c, ref size_t pos, ref JSONParseHint hint)
{
    // position within the number grammar; each stage names what was seen last
    enum Stage
    {
        initial,
        minus,
        zero,
        intDigits,
        point,
        fracDigits,
        expMark,
        expSign,
        expDigits,
    }

    auto win = c.window;
    immutable firstPos = pos;
    hint = JSONParseHint.Int;
    Stage stage;

    for(;; ++pos)
    {
        if(pos == win.length)
        {
            // window exhausted, ask the pipe for more
            if(c.extend(0) == 0)
            {
                // End of data: acceptable only in a stage where the number
                // is already complete.
                immutable complete = stage == Stage.zero
                    || stage == Stage.intDigits
                    || stage == Stage.fracDigits
                    || stage == Stage.expDigits;
                return complete ? cast(int)(pos - firstPos) : -1;
            }
            win = c.window;
        }

        auto ch = win[pos];
        immutable isDigit = ch >= '0' && ch <= '9';
        immutable isExpMark = ch == 'e' || ch == 'E';

        final switch(stage) with(Stage)
        {
        case initial:
            // optional sign, otherwise the first digit
            if(ch == '-')
                stage = minus;
            else if(ch == '0')
                stage = zero;
            else if(isDigit)
                stage = intDigits;
            else
                return -1;
            break;
        case minus:
            // a digit is mandatory after the sign
            if(ch == '0')
                stage = zero;
            else if(isDigit)
                stage = intDigits;
            else
                return -1;
            break;
        case zero:
        case intDigits:
            // further digits are only allowed if the number did not start
            // with a zero
            if(stage == intDigits && isDigit)
                break;
            if(ch == '.')
            {
                hint = JSONParseHint.Float;
                stage = point;
            }
            else if(isExpMark)
            {
                hint = JSONParseHint.Exp;
                stage = expMark;
            }
            else
                return cast(int)(pos - firstPos); // number ended
            break;
        case point:
            // at least one digit must follow the decimal point
            if(!isDigit)
                return -1;
            stage = fracDigits;
            break;
        case fracDigits:
            if(isDigit)
                break;
            if(!isExpMark)
                return cast(int)(pos - firstPos); // number ended
            hint = JSONParseHint.Exp;
            stage = expMark;
            break;
        case expMark:
            // optional exponent sign, otherwise the first exponent digit
            if(ch == '+' || ch == '-')
                stage = expSign;
            else if(isDigit)
                stage = expDigits;
            else
                return -1;
            break;
        case expSign:
            if(!isDigit)
                return -1;
            stage = expDigits;
            break;
        case expDigits:
            if(!isDigit)
                return cast(int)(pos - firstPos); // number ended
            break;
        }
    }

    // all returns should happen in the infinite loop.
    assert(0);
}
575 
unittest
{
    // One entry per input: the text, whether parsing must fail, and the hint
    // expected when it succeeds.
    static struct Case
    {
        string text;
        bool shouldFail;
        JSONParseHint expectedHint = JSONParseHint.Int;
    }

    Case[] cases = [
        Case("e1", true),
        Case("0", false),
        Case("12345", false),
        Case("100.0", false, JSONParseHint.Float),
        Case("0.1e-1", false, JSONParseHint.Exp),
        Case("-0.1e-1", false, JSONParseHint.Exp),
        Case("-.1e-1", true),
        Case("123.", true),
        Case("--123", true),
        Case(".1", true),
        Case("0.1e", true),
    ];

    foreach(tc; cases)
    {
        string jsonString = tc.text;
        size_t pos;
        JSONParseHint hint;
        auto result = parseNumber(jsonString, pos, hint);
        if(tc.shouldFail)
        {
            assert(result == -1, jsonString);
            continue;
        }
        // a valid number must consume the whole input
        assert(result == jsonString.length, jsonString);
        assert(pos == jsonString.length, jsonString);
        assert(hint == tc.expectedHint, jsonString);
    }
}
606 
/**
 * Extract a single item from the given iopipe. No context is tracked, so the
 * overall structure of the JSON stream is not validated here; the only
 * guarantee is that the returned item is itself a well-formed JSON item.
 *
 * Params:
 *     replaceEscapes = Forwarded to the string parser to select how escapes
 *     are handled. See parseString for details.
 *     c = iopipe from which to parse the item. It may be extended if needed.
 *     pos = Current position in the iopipe's window at which the next item
 *     should start. Leading whitespace is allowed.
 *
 * Returns: Details of the item if the stream holds a valid JSON item. An item
 * with token EOF if the stream has no more items. An item with token Error if
 * the data could not be parsed for any reason.
 *
 */
JSONItem jsonItem(bool replaceEscapes = true, Chain)(ref Chain c, ref size_t pos)
{
    JSONItem item;
    item.token = jsonTok(c, pos);
    item.offset = pos;

    // Confirms that the window holds exactly `literal` starting at pos. On a
    // match the item length is set and pos moves past the literal; otherwise
    // the item becomes an Error located at the first offending position.
    void expectLiteral(string literal)
    {
        immutable needed = pos + literal.length;
        if(needed > c.window.length)
        {
            // not enough data in the window yet, try to get more
            c.ensureElems(needed);
        }

        auto rest = c.window[pos .. $];

        if(literal.length > rest.length)
        {
            // the stream ended too early, cannot be valid json.
            item.offset = c.window.length;
            item.token = JSONToken.Error;
            return;
        }

        // compare element by element; std.algorithm.equal would autodecode
        foreach(idx, want; literal)
        {
            if(rest[idx] != want)
            {
                item.offset = pos + idx;
                item.token = JSONToken.Error;
                return;
            }
        }

        item.length = literal.length;
        pos += literal.length;
    }

    final switch(item.token) with (JSONToken)
    {
    case EOF:
    case Error:
        // nothing further to record
        break;
    case ObjectStart:
    case ObjectEnd:
    case ArrayStart:
    case ArrayEnd:
    case Colon:
    case Comma:
        // punctuation is always a single element
        item.length = 1;
        ++pos;
        break;
    case Null:
        expectLiteral("null");
        break;
    case True:
        expectLiteral("true");
        break;
    case False:
        expectLiteral("false");
        break;
    case Number:
        {
            immutable count = parseNumber(c, pos, item.hint);
            if(count >= 0)
            {
                item.length = count;
            }
            else
            {
                item.token = Error;
                item.length = pos - item.offset;
            }
        }
        break;
    case String:
        {
            immutable count = parseString!replaceEscapes(c, pos, item.hint);
            if(count >= 0)
            {
                // the item starts after the opening quote
                item.offset++;
                item.length = count;
            }
            else
            {
                item.token = Error;
                item.length = pos - item.offset;
            }
        }
        break;
    }
    return item;
}
724 
725 /**
726  * An object used to parse JSON items from a given iopipe chain. As the items
727  * are parsed, the structure of the JSON data is validated. Note that the data
728  * returned is simply references to within the iopipe window.
729  *
730  * Each new item/token can be obtained by calling the `next` method.
731  */
732 struct JSONTokenizer(Chain, bool replaceEscapes)
733 {
734     import std.bitmanip : BitArray;
735 
736     /**
737      * The iopipe source. Use this to parse the data returned. Do not call
738      * chain.release directly, use the release method instead to make sure the
739      * internal state is maintained.
740      */
741     Chain chain;
742 
743     private
744     {
        // What the tokenizer expects the next item to be.
        private enum State : ubyte
        {
            Begin,  // next item should be either an Object or Array
            First,  // Just started a new object or array.
            Member, // Expect next member (name for object, value for array)
            Colon,  // Expect colon (Object only)
            Value,  // Expect value
            Comma,  // Expect comma or end of collection.
            End     // there shouldn't be any more items
        }
755 
756         // bit array indicates structure of JSON parser (nesting).
757         // 0 = array, 1 = object
758         BitArray stack;
759         size_t stackLen;
760         size_t pos;
761         State state;
762         bool inObj()
763         {
764             return stackLen == 0 ? false : stack[stackLen - 1];
765         }
766 
767         void pushContainer(bool isObj)
768         {
769             if(stackLen == stack.length)
770                 stack ~= isObj;
771             else
772                 stack[stackLen] = isObj;
773             ++stackLen;
774         }
775 
776         void popContainer()
777         {
778             state = (--stackLen == 0) ? State.End : State.Comma;
779         }
780 
781         // caching items allows us to parse only once, yet review the items later.
782         bool caching;
783         JSONItem[] cache;
784         size_t cIdx;
785     }
786 
787     @property bool finished()
788     {
789         return state == State.End;
790     }
791 
792     // start caching elements. When this is enabled, rewind will jump back to
793     // the first element and replay from the cache instead of parsing. Make
794     // sure to call endCache when you are done with the replay cache.
795     void startCache()
796     {
797         caching = true;
798         if(cIdx != 0)
799         {
800             // remove all the elements before the current one, or else the
801             // rewind command will not work right.
802             import std.algorithm.mutation : copy;
803             copy(cache[cIdx .. $], cache[0 .. $ - cIdx]);
804             cache = cache[0 .. $ - cIdx];
805             cache.assumeSafeAppend;
806             cIdx = 0;
807         }
808     }
809 
810     /**
811      * stop caching elements (the cache will be freed when no longer needed)
812      */
813     void endCache()
814     {
815         caching = false;
816         if(cIdx == cache.length)
817         {
818             // deallocate the cache.
819             cache.length = 0;
820             cache.assumeSafeAppend;
821             cIdx = 0;
822         }
823     }
824 
825     // this specialized function will skip the current item, taking into
826     // account the nested nature of JSON. The return value is the next JSONItem
827     // after the skipped data.
828     //
829     // If at the beginning of the JSON stream, the entire JSON stream is parsed
830     // until the end of the JSON data in the stream.
831     //
832     // If at a member name, colon, or value expected, the entire member is skipped.
833     //
834     // If at a comma, the comma is skipped.
835     //
836     // If an error is encountered, it is returned immediately
837     //
838     JSONItem skipItem()
839     {
840         size_t depth = 0;
841         // parse until we see the stack length get less than our current depth,
842         // until we see a comma, error, or end of stream.
843         while(true)
844         {
845             auto item = next();
846             with(JSONToken) switch(item.token)
847             {
848             case ObjectStart:
849             case ArrayStart:
850                 ++depth;
851                 break;
852             case ObjectEnd:
853             case ArrayEnd:
854                 if(!depth)
855                     // at the end of the current object
856                     return item;
857                 --depth;
858                 break;
859             case Comma:
860                 if(depth == 0)
861                     return item;
862                 break;
863             case Error:
864             case EOF:
865                 return item;
866             default:
867                 // everything else we ignore
868                 break;
869             }
870         }
871     }
872 
873     // Parse until it finds a specific member/submember. The assumptino is that
874     // the current item is an object start.
875     //
876     // Returns true if the specified submember was found, and the parser is
877     // queued to parse the value of that member.
878     //
879     // Returns false if the object was searched, but the submember could not be
880     // found.
881     //
882     // Also returns false if this is not an object.
883     bool parseTo(string[] submember...)
884     {
885         import std.algorithm : equal;
886         while(submember.length > 0)
887         {
888             // jump into the first member.
889             if(peek != JSONToken.ObjectStart)
890                 return false;
891             cast(void)next;
892             if(peek != JSONToken.String)
893                 return false;
894             auto item = next;
895             while(!item.data(chain).equal(submember[0]))
896             {
897                 item = skipItem();
898                 if(item.token == JSONToken.ObjectEnd)
899                     return false;
900                 else if(item.token == JSONToken.Comma)
901                     item = next;
902                 else
903                     // something went wrong.
904                     return false;
905             }
906             // found the item
907             if(peek != JSONToken.Colon)
908                 return false;
909             item = next;
910             submember = submember[1 .. $];
911         }
912 
913         return true;
914     }
915 
916     // where are we in the buffer
917     @property size_t position()
918     {
919         if(cIdx < cache.length)
920             return cache[cIdx].offset;
921         return pos;
922     }
923 
924     /**
925      * Obtain the next JSONItem from the stream.
926      */
927     JSONItem next()
928     {
929         if(cIdx < cache.length)
930         {
931             auto item = cache[cIdx++];
932             if(cIdx == cache.length && !caching)
933             {
934                 // done with the cache
935                 cache.length = 0;
936                 cache.assumeSafeAppend;
937                 cIdx = 0;
938             }
939             return item;
940         }
941 
942         if(state == State.End)
943             // return an EOF item, even if the stream is not done.
944             return JSONItem(pos, 0, JSONToken.EOF);
945 
946         // else, not cached, parse item from the chain.
947         auto item = chain.jsonItem!replaceEscapes(pos);
948 
949         final switch(state) with(JSONToken)
950         {
951         case State.Begin:
952             // item needs to be an ObjectStart or ArrayStart
953             if(item.token == ObjectStart || item.token == ArrayStart)
954             {
955                 state = State.First;
956                 pushContainer(item.token == ObjectStart);
957             }
958             else
959                 item.token = Error;
960             break;
961         case State.First:
962             // allow ending of the container
963             if(item.token == (inObj ? ObjectEnd : ArrayEnd))
964             {
965                 popContainer();
966                 break;
967             }
968             goto case State.Member;
969         case State.Member:
970             if(inObj)
971             {
972                 if(item.token == String)
973                     state = State.Colon;
974                 else
975                     item.token = Error;
976                 break;
977             }
978             goto case State.Value;
979         case State.Colon:
980             // requires colon
981             if(item.token == Colon)
982                 state = State.Value;
983             else
984                 item.token = Error;
985             break;
986         case State.Value:
987             if(item.token.isValue)
988             {
989                 if(item.token == ObjectStart || item.token == ArrayStart)
990                 {
991                     pushContainer(item.token == ObjectStart);
992                     state = State.First;
993                 }
994                 else
995                     state = State.Comma;
996             }
997             else
998                 item.token = Error;
999             break;
1000         case State.Comma:
1001             // can end the object here, or get a comma
1002             if(item.token == (inObj ? ObjectEnd : ArrayEnd))
1003                 popContainer();
1004             else if(item.token == Comma)
1005                 state = State.Member;
1006             else
1007                 item.token = Error;
1008             break;
1009         case State.End:
1010             // this is handled outside the switch statement
1011             assert(0);
1012         }
1013 
1014         if(caching)
1015         {
1016             cache ~= item;
1017             ++cIdx;
1018         }
1019         return item;
1020     }
1021 
1022     void rewind()
1023     {
1024         assert(caching);
1025         cIdx = 0;
1026     }
1027 
1028     /**
1029      * Peek at the input stream and see what's coming next.
1030      */
1031     JSONToken peek()
1032     {
1033         if(cIdx < cache.length)
1034             return cache[cIdx].token;
1035         return chain.jsonTok(pos);
1036     }
1037 
1038     /**
1039      * Release the given number of stream elements from the stream.
1040      * Note: you are only allowed to release elements that are ALREADY parsed.
1041      *
1042      * Params: elements = the number of code units to release from the stream.
1043      */
1044     void release(size_t elements)
1045     {
1046         // not compatible while we are caching. You can still have a cache, but
1047         // the caching needs to be turned off.
1048         assert(!caching);
1049 
1050         // release items from the chain window.
1051         assert(position >= elements);
1052         chain.release(elements);
1053         pos -= elements;
1054 
1055         // update the cache if it exists
1056         if(cache.length > 0)
1057         {
1058             size_t toRemove = 0;
1059             foreach(ref ci; cache)
1060             {
1061                 if(ci.offset < elements)
1062                     ++toRemove;
1063                 else
1064                     ci.offset -= elements;
1065             }
1066             if(toRemove > 0)
1067             {
1068                 // we shouldn't be removing any elements still in use.
1069                 assert(toRemove <= cIdx);
1070 
1071                 import std.algorithm.mutation : copy;
1072                 auto validElems = cache.length - toRemove;
1073                 copy(cache[toRemove .. $], cache[0 .. validElems]);
1074                 cache = cache[0 .. validElems];
1075                 cache.assumeSafeAppend;
1076                 cIdx -= toRemove;
1077             }
1078         }
1079     }
1080 
1081 
1082     /**
1083      * Release all elements that have been parsed completely. The return value
1084      * is the number of elements that were released. Note that this can be done
1085      * at any point, it doesn't matter if the parser is partway through an
1086      * object, the rest of that object can still be parsed.
1087      *
1088      * The goal is to free up buffer space for more incoming data.  It is
1089      * better to call this function than release directly if you just want to
1090      * free up parsed data.
1091      *
1092      * Note that any string elements that refer to the buffer are invalidated,
1093      * since the buffer space will be gone.
1094      *
1095      * Returns: The number of elements that were released.
1096      */
1097     size_t releaseParsed()
1098     {
1099         auto result = position;
1100         if(result)
1101             release(result);
1102         return result;
1103     }
1104 }
1105 
/**
 * Wrap a text iopipe into a JSONTokenizer struct.
 */
auto jsonTokenizer(bool replaceEscapes = true, Chain)(Chain c)
{
    // Name the concrete tokenizer type for this chain and escape setting,
    // then construct it from the chain.
    alias Tokenizer = JSONTokenizer!(Chain, replaceEscapes);
    return Tokenizer(c);
}
1113 
unittest
{
    import std.typecons: Tuple, tuple;
    import std.meta: AliasSeq;

    // An expected (token, text) pair.
    alias Check = Tuple!(JSONToken, string);

    // Tokenize `jsonData` up to and including the first EOF or Error item,
    // then compare each item's token and text with `verifyAgainst`.
    void verifyJson(bool replaceEscapes, C)(C[] jsonData, Check[] verifyAgainst)
    {
        import std.algorithm.comparison: equal;
        import std.format: format;

        // use a simple pipe to simulate not having all the data available at once.
        auto pipeAdapter = SimplePipe!(C[])(jsonData);
        auto parser = jsonTokenizer!replaceEscapes(pipeAdapter);

        JSONItem[] items;
        do
        {
            items ~= parser.next;
        }
        while(items[$-1].token != JSONToken.EOF && items[$-1].token != JSONToken.Error);

        assert(items.length == verifyAgainst.length);

        // A run that ended in EOF must have consumed all of the input.
        if(items[$-1].token == JSONToken.EOF)
            assert(parser.pos == jsonData.length);

        foreach(idx, item; items)
        {
            auto wanted = verifyAgainst[idx];
            assert(item.token == wanted[0]);
            assert(equal(item.data(jsonData), wanted[1]), format("(C = %s, replace = %s, curdata = %s) %s != %s", C.stringof, replaceEscapes, jsonData[0 .. parser.pos], item.data(jsonData), wanted[1]));
        }
    }

    auto jsonData = q"{   {
            "abc" : 123.456,
                "def": [1,0.5, 8e10, "hi", "\r\n\f\b\u0025", {}, true, false, null] }}";

    // Expected items when escapes are left alone. `replaceItem` is the index
    // of the one string whose text changes when escapes are replaced.
    Check[] checkitems;
    size_t replaceItem;
    with(JSONToken)
    {
        checkitems = [
            Check(ObjectStart, "{"),
            Check(String, "abc"),
            Check(Colon, ":"),
            Check(Number, "123.456"),
            Check(Comma, ","),
            Check(String, "def"),
            Check(Colon, ":"),
            Check(ArrayStart, "["),
            Check(Number, "1"),
            Check(Comma, ","),
            Check(Number, "0.5"),
            Check(Comma, ","),
            Check(Number, "8e10"),
            Check(Comma, ","),
            Check(String, "hi"),
            Check(Comma, ","),
            Check(String, "\\r\\n\\f\\b\\u0025")];
        replaceItem = checkitems.length - 1;
        checkitems ~= [
            Check(Comma, ","),
            Check(ObjectStart, "{"),
            Check(ObjectEnd, "}"),
            Check(Comma, ","),
            Check(True, "true"),
            Check(Comma, ","),
            Check(False, "false"),
            Check(Comma, ","),
            Check(Null, "null"),
            Check(ArrayEnd, "]"),
            Check(ObjectEnd, "}"),
            Check(EOF, "")];
    }

    // Same expectations, except the escaped string holds the decoded text.
    auto checkWithReplaceEscapes = checkitems.dup;
    checkWithReplaceEscapes[replaceItem][1] = "\r\n\f\b%";

    // Run both modes for every character width.
    foreach(T; AliasSeq!(char, wchar, dchar))
    {
        import std.conv: to;
        verifyJson!false(jsonData.to!(T[]), checkitems);
        verifyJson!true(jsonData.to!(T[]), checkWithReplaceEscapes);
    }

    // now, test to make sure the parser fails properly
    verifyJson!false(q"{123.456}", [Check(JSONToken.Error, "123.456")]);
    verifyJson!false(q"{{123.456}}", [Check(JSONToken.ObjectStart, "{"), Check(JSONToken.Error, "123.456")]);
}
1197 
unittest
{
    // test caching
    auto jsonData = q"{{"a": 1,"b": 123.456, "c": null}}";
    auto parser = jsonData.jsonTokenizer!false;

    // Fetch the next item and compare its token and text with the expectation.
    bool nextIs(JSONToken token, string expected)
    {
        auto item = parser.next;
        return item.token == token && item.data(parser.chain) == expected;
    }

    assert(nextIs(JSONToken.ObjectStart, "{"));
    assert(nextIs(JSONToken.String, "a"));
    assert(nextIs(JSONToken.Colon, ":"));

    // cache the start of a value
    parser.startCache;
    assert(nextIs(JSONToken.Number, "1"));
    assert(nextIs(JSONToken.Comma, ","));
    assert(nextIs(JSONToken.String, "b"));
    assert(nextIs(JSONToken.Colon, ":"));

    // replay the cache for the value of a
    parser.rewind();
    assert(nextIs(JSONToken.Number, "1"));
    assert(nextIs(JSONToken.Comma, ","));

    // now with the cache still in there, restart the cache
    parser.startCache;
    assert(nextIs(JSONToken.String, "b"));
    assert(nextIs(JSONToken.Colon, ":"));
    assert(nextIs(JSONToken.Number, "123.456"));
    assert(nextIs(JSONToken.Comma, ","));

    // replay b again
    parser.rewind();
    parser.endCache();
    assert(nextIs(JSONToken.String, "b"));
    assert(nextIs(JSONToken.Colon, ":"));

    // test out releasing cached data
    parser.releaseParsed();
    assert(nextIs(JSONToken.Number, "123.456"));
    assert(nextIs(JSONToken.Comma, ","));
    assert(nextIs(JSONToken.String, "c"));
    assert(nextIs(JSONToken.Colon, ":"));
    assert(nextIs(JSONToken.Null, "null"));

    // the cache should now be exhausted
    assert(parser.cache.length == 0);
    assert(parser.cIdx == 0);
    assert(nextIs(JSONToken.ObjectEnd, "}"));
}
1250 
unittest
{
    // test skipping items
    auto jsonData = q"{{"a" : 1, "b" : {"c" : [1,2,3], "d" : { "hello" : "world" }}}}";

    auto parser = jsonData.jsonTokenizer!false;

    // Compare an item with the expected token and text. On a mismatch the
    // item is printed to help with debugging.
    bool check(JSONItem item, JSONToken token, string expected)
    {
        auto matches = item.token == token && item.data(parser.chain) == expected;
        if(!matches)
        {
            import std.stdio;
            writeln(item);
        }
        return matches;
    }

    // The next item must be EOF and the position must be at the end of the
    // input.
    void expectDone()
    {
        assert(check(parser.next, JSONToken.EOF, ""));
        assert(parser.position == jsonData.length);
    }

    with(JSONToken)
    {
        parser.startCache;
        auto skipped = parser.skipItem(); // skip it all;
        assert(check(skipped, EOF, ""));
        assert(parser.position == jsonData.length);

        // start over
        parser.rewind;
        assert(check(parser.next, ObjectStart, "{"));
        assert(check(parser.skipItem, Comma, ","));
        assert(check(parser.next, String, "b"));
        assert(check(parser.skipItem, ObjectEnd, "}"));
        expectDone();

        // another try
        parser.rewind;
        assert(check(parser.next, ObjectStart, "{"));
        assert(check(parser.skipItem, Comma, ","));
        assert(check(parser.next, String, "b"));
        assert(check(parser.next, Colon, ":"));
        assert(check(parser.next, ObjectStart, "{"));
        assert(check(parser.next, String, "c"));
        assert(check(parser.skipItem, Comma, ","));
        assert(check(parser.next, String, "d"));
        assert(check(parser.skipItem, ObjectEnd, "}"));
        assert(check(parser.skipItem, ObjectEnd, "}"));
        expectDone();

        // test parseTo
        parser.rewind;
        assert(parser.parseTo("b", "d", "hello"));
        assert(check(parser.next, String, "world"));
        assert(check(parser.next, ObjectEnd, "}"));
        assert(check(parser.next, ObjectEnd, "}"));
        assert(check(parser.next, ObjectEnd, "}"));
        expectDone();
    }
}