LLDB mainline
Swift/tree-sitter-swift/scanner.c
Go to the documentation of this file.
1#include "tree_sitter/parser.h"
2#include <string.h>
3#include <wctype.h>
4
5#define TOKEN_COUNT 33
6
42
#define OPERATOR_COUNT 20

// Operator and keyword spellings this scanner recognizes. Order matters: the
// OP_SYMBOLS / OP_ILLEGAL_TERMINATORS / OP_SYMBOL_SUPPRESSOR tables below are
// indexed in parallel with this array.
const char *OPERATORS[OPERATOR_COUNT] = {
    "->",     ".",        "&&",      "||",    "??",
    "=",      "==",       "+",       "-",     "!",
    "throws", "rethrows", "default", "where", "else",
    "catch",  "as",       "as?",     "as!",   "async"};
49

// Lookahead-character groups that must NOT immediately follow each operator
// for it to match (indexed in parallel with OPERATORS).
// NOTE(review): the declaration line
// (`enum IllegalTerminatorGroup OP_ILLEGAL_TERMINATORS[OPERATOR_COUNT] = {`)
// and the entries for `=` and `!` appear to have been elided from this dump.
    OPERATOR_SYMBOLS, // ->
    OPERATOR_OR_DOT,  // .
    OPERATOR_SYMBOLS, // &&
    OPERATOR_SYMBOLS, // ||
    OPERATOR_SYMBOLS, // ??
    OPERATOR_SYMBOLS, // ==
    NON_WHITESPACE,   // +
    NON_WHITESPACE,   // -
    ALPHANUMERIC,     // throws
    ALPHANUMERIC,     // rethrows
    ALPHANUMERIC,     // default
    ALPHANUMERIC,     // where
    ALPHANUMERIC,     // else
    ALPHANUMERIC,     // catch
    ALPHANUMERIC,     // as
    OPERATOR_SYMBOLS, // as?
    OPERATOR_SYMBOLS, // as!
    ALPHANUMERIC      // async
};
79

// Per-operator bitmask of token symbols that, when valid at the current
// position, suppress emitting the operator (indexed in parallel with
// OPERATORS). Used by eat_operators so e.g. `try!` can let the parser take
// the `!` as an immediate token instead.
// NOTE(review): the declaration line
// (`const uint64_t OP_SYMBOL_SUPPRESSOR[OPERATOR_COUNT] = {`) appears to have
// been elided from this dump.
    0,                     // ARROW_OPERATOR,
    0,                     // DOT_OPERATOR,
    0,                     // CONJUNCTION_OPERATOR,
    0,                     // DISJUNCTION_OPERATOR,
    0,                     // NIL_COALESCING_OPERATOR,
    0,                     // EQUAL_SIGN,
    0,                     // EQ_EQ,
    0,                     // PLUS_THEN_WS,
    0,                     // MINUS_THEN_WS,
    // NOTE(review): `1UL` is only 32 bits on LLP64/ILP32 targets; if
    // FAKE_TRY_BANG can ever be >= 32 this should be `1ULL` — confirm.
    1UL << FAKE_TRY_BANG,  // BANG,
    0,                     // THROWS_KEYWORD,
    0,                     // RETHROWS_KEYWORD,
    0,                     // DEFAULT_KEYWORD,
    0,                     // WHERE_KEYWORD,
    0,                     // ELSE_KEYWORD,
    0,                     // CATCH_KEYWORD,
    0,                     // AS_KEYWORD,
    0,                     // AS_QUEST,
    0,                     // AS_BANG,
    0,                     // ASYNC_KEYWORD
};
123
#define RESERVED_OP_COUNT 31

// Operator spellings reserved by the Swift language; a CUSTOM_OPERATOR match
// is rejected when it is exactly one of these (see any_reserved_ops).
// NOTE(review): the declaration line
// (`const char *RESERVED_OPS[RESERVED_OP_COUNT] = {`) appears to have been
// elided from this dump.
    "/", "=", "-", "+", "!", "*", "%", "<", ">", "&", "|",
    "^", "?", "~", ".", "..", "->", "/*", "*/", "+=", "-=", "*=",
    "/=", "%=", ">>", "<<", "++", "--", "===", "...", "..<"};
130
// Report whether token `op` may still be emitted when a pending newline
// (implicit semicolon) precedes it — see the operator check in
// tree_sitter_swift_external_scanner_scan. BANG is explicitly excluded.
static bool is_cross_semi_token(enum TokenType op) {
  switch (op) {
  case ARROW_OPERATOR:
  case DOT_OPERATOR:
  // NOTE(review): cases for CONJUNCTION_OPERATOR, DISJUNCTION_OPERATOR and
  // NIL_COALESCING_OPERATOR appear to have been elided from this dump.
  case EQUAL_SIGN:
  case EQ_EQ:
  case PLUS_THEN_WS:
  case MINUS_THEN_WS:
  case THROWS_KEYWORD:
  case RETHROWS_KEYWORD:
  case DEFAULT_KEYWORD:
  case WHERE_KEYWORD:
  case ELSE_KEYWORD:
  case CATCH_KEYWORD:
  case AS_KEYWORD:
  case AS_QUEST:
  case AS_BANG:
  case ASYNC_KEYWORD:
  case CUSTOM_OPERATOR:
    return true;
  case BANG:
  default:
    return false;
  }
}
159
#define NON_CONSUMING_CROSS_SEMI_CHAR_COUNT 3
// Characters that suppress an implicit semicolon but are NOT consumed by the
// scanner (checked in eat_whitespace).
// NOTE(review): the array name and initializer
// (`NON_CONSUMING_CROSS_SEMI_CHARS[NON_CONSUMING_CROSS_SEMI_CHAR_COUNT] =
// {...};`) appear to have been elided from this dump.
const uint32_t

165/**
166 * All possible results of having performed some sort of parsing.
167 *
168 * A parser can return a result along two dimensions:
169 * 1. Should the scanner continue trying to find another result?
170 * 2. Was some result produced by this parsing attempt?
171 *
172 * These are flattened into a single enum together. When the function returns
173 * one of the `TOKEN_FOUND` cases, it will always populate its `symbol_result`
174 * field. When it returns one of the `STOP_PARSING` cases, callers should
175 * immediately return (with the value, if there is one).
176 */
185
189
191 return calloc(0, sizeof(struct ScannerState));
192}
193
// Release the state allocated by tree_sitter_swift_external_scanner_create.
// Safe to call with NULL (free(NULL) is a no-op).
void tree_sitter_swift_external_scanner_destroy(void *payload) {
  free(payload);
}
197
  // Reset per-parse scanner state.
  // NOTE(review): the function signature
  // (`void tree_sitter_swift_external_scanner_reset(void *payload)`) and the
  // statement clearing state->ongoing_raw_str_hash_count appear to have been
  // elided from this dump — confirm against the upstream source.
  struct ScannerState *state = (struct ScannerState *)payload;
}
202
// Serialize the scanner state into `buffer` as a big-endian u32, returning
// the number of bytes written.
// NOTE(review): the first line of the signature
// (`unsigned tree_sitter_swift_external_scanner_serialize(void *payload,`)
// appears to have been elided from this dump.
                                             char *buffer) {
  struct ScannerState *state = (struct ScannerState *)payload;
  uint32_t hash_count = state->ongoing_raw_str_hash_count;
  // Most-significant byte first, so deserialization is independent of host
  // endianness.
  buffer[0] = (hash_count >> 24) & 0xff;
  buffer[1] = (hash_count >> 16) & 0xff;
  buffer[2] = (hash_count >> 8) & 0xff;
  buffer[3] = (hash_count) & 0xff;
  return 4;
}
213
215 const char *buffer,
216 unsigned length) {
217 if (length < 4) {
218 return;
219 }
220
221 uint32_t hash_count =
222 ((((uint32_t)buffer[0]) << 24) | (((uint32_t)buffer[1]) << 16) |
223 (((uint32_t)buffer[2]) << 8) | (((uint32_t)buffer[3])));
224 struct ScannerState *state = (struct ScannerState *)payload;
225 state->ongoing_raw_str_hash_count = hash_count;
226}
227
228static void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
229
// The layout logic treats an explicit `;` exactly like whitespace, in
// addition to everything iswspace accepts.
static bool should_treat_as_wspace(int32_t character) {
  if (character == ';') {
    return true;
  }
  return iswspace(character) != 0;
}
233
234static int32_t encountered_op_count(bool *encountered_operator) {
235 int32_t encountered = 0;
236 for (int op_idx = 0; op_idx < OPERATOR_COUNT; op_idx++) {
237 if (encountered_operator[op_idx]) {
238 encountered++;
239 }
240 }
241
242 return encountered;
243}
244
245static bool any_reserved_ops(uint8_t *encountered_reserved_ops) {
246 for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) {
247 if (encountered_reserved_ops[op_idx] == 2) {
248 return true;
249 }
250 }
251
252 return false;
253}
254
// Inclusive code-point ranges that may appear anywhere in a custom operator
// (the grammar's non-ASCII "operator-head" characters).
static const int32_t OP_HEAD_RANGES[][2] = {
    {0x00A1, 0x00A7}, {0x00A9, 0x00A9}, {0x00AB, 0x00AB}, {0x00AC, 0x00AC},
    {0x00AE, 0x00AE}, {0x00B0, 0x00B1}, {0x00B6, 0x00B6}, {0x00BB, 0x00BB},
    {0x00BF, 0x00BF}, {0x00D7, 0x00D7}, {0x00F7, 0x00F7}, {0x2016, 0x2017},
    {0x2020, 0x2027}, {0x2030, 0x203E}, {0x2041, 0x2053}, {0x2055, 0x205E},
    {0x2190, 0x23FF}, {0x2500, 0x2775}, {0x2794, 0x2BFF}, {0x2E00, 0x2E7F},
    {0x3001, 0x3003}, {0x3008, 0x3020}, {0x3030, 0x3030},
};

// Combining-character ranges: legal in a custom operator anywhere except the
// first position.
static const int32_t OP_COMBINING_RANGES[][2] = {
    {0x0300, 0x036F}, {0x1DC0, 0x1DFF},   {0x20D0, 0x20FF},
    {0xFE00, 0xFE0F}, {0xFE20, 0xFE2F},   {0xE0100, 0xE01EF},
};

// Linear scan over a small table of inclusive [lo, hi] ranges.
static bool op_char_in_ranges(const int32_t (*ranges)[2], size_t count,
                              int32_t c) {
  for (size_t i = 0; i < count; i++) {
    if (c >= ranges[i][0] && c <= ranges[i][1]) {
      return true;
    }
  }
  return false;
}

// Decide whether `cur_char` may appear at position `char_idx` of a custom
// operator whose first character was `first_char`.
static bool is_legal_custom_operator(int32_t char_idx, int32_t first_char,
                                     int32_t cur_char) {
  switch (cur_char) {
  case '=':
  case '-':
  case '+':
  case '!':
  case '%':
  case '<':
  case '>':
  case '&':
  case '|':
  case '^':
  case '?':
  case '~':
    // Plain ASCII operator characters are legal at any position.
    return true;
  case '.':
    // Grammar allows `.` at the start, or anywhere in a dot-initial operator.
    return char_idx == 0 || first_char == '.';
  case '*':
  case '/':
    // `/*` and `//` start comments, so they cannot begin an operator.
    return char_idx != 1 || first_char != '/';
  default:
    if (op_char_in_ranges(OP_HEAD_RANGES,
                          sizeof OP_HEAD_RANGES / sizeof OP_HEAD_RANGES[0],
                          cur_char)) {
      return true;
    }
    if (op_char_in_ranges(
            OP_COMBINING_RANGES,
            sizeof OP_COMBINING_RANGES / sizeof OP_COMBINING_RANGES[0],
            cur_char)) {
      return char_idx != 0;
    }
    return false;
  }
}
309
/**
 * Try to consume one of the known OPERATORS, or a custom operator, from the
 * stream.
 *
 * `prior_char` is a character the caller already consumed on our behalf
 * ('\0' if none); matching then resumes at string index 1. When `mark_end`
 * is set, the lexer's token end is moved as matches are confirmed. On
 * success, the matched token is written to `*symbol_result` and true is
 * returned.
 */
static bool eat_operators(TSLexer *lexer, const bool *valid_symbols,
                          bool mark_end, const int32_t prior_char,
                          enum TokenType *symbol_result) {
  // Seed the candidate sets: a known operator stays possible only if its
  // token is valid here and its first character agrees with any prior_char.
  bool possible_operators[OPERATOR_COUNT];
  uint8_t reserved_operators[RESERVED_OP_COUNT];
  for (int op_idx = 0; op_idx < OPERATOR_COUNT; op_idx++) {
    possible_operators[op_idx] =
        valid_symbols[OP_SYMBOLS[op_idx]] &&
        (!prior_char || OPERATORS[op_idx][0] == prior_char);
  }
  for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) {
    reserved_operators[op_idx] =
        !prior_char || RESERVED_OPS[op_idx][0] == prior_char;
  }

  bool possible_custom_operator = valid_symbols[CUSTOM_OPERATOR];
  int32_t first_char = prior_char ? prior_char : lexer->lookahead;
  int32_t last_examined_char = first_char;

  int32_t str_idx = prior_char ? 1 : 0;
  int32_t full_match = -1;
  while (true) {
    // Advance every still-possible known operator by one character.
    for (int op_idx = 0; op_idx < OPERATOR_COUNT; op_idx++) {
      if (!possible_operators[op_idx]) {
        continue;
      }

      if (OPERATORS[op_idx][str_idx] == '\0') {
        // Make sure that the operator is allowed to have the next character as
        // its lookahead.
        // NOTE(review): the initializer expression on the next statement
        // appears truncated in this dump — likely
        // `OP_ILLEGAL_TERMINATORS[op_idx];`. Confirm against upstream.
        enum IllegalTerminatorGroup illegal_terminators =
        switch (lexer->lookahead) {
        // See "Operators":
        // https://docs.swift.org/swift-book/ReferenceManual/LexicalStructure.html#ID418
        case '/':
        case '=':
        case '-':
        case '+':
        case '!':
        case '*':
        case '%':
        case '<':
        case '>':
        case '&':
        case '|':
        case '^':
        case '?':
        case '~':
          if (illegal_terminators == OPERATOR_SYMBOLS) {
            break;
          } // Otherwise, intentionally fall through to the OPERATOR_OR_DOT case
          // fall through
        case '.':
          if (illegal_terminators == OPERATOR_OR_DOT) {
            break;
          } // Otherwise, fall through to DEFAULT which checks its groups
            // directly
          // fall through
        default:
          if (iswalnum(lexer->lookahead) &&
              illegal_terminators == ALPHANUMERIC) {
            break;
          }

          if (!iswspace(lexer->lookahead) &&
              illegal_terminators == NON_WHITESPACE) {
            break;
          }

          // The lookahead is a legal terminator: record this full match.
          full_match = op_idx;
          if (mark_end) {
            lexer->mark_end(lexer);
          }
        }

        possible_operators[op_idx] = false;
        continue;
      }

      if (OPERATORS[op_idx][str_idx] != lexer->lookahead) {
        possible_operators[op_idx] = false;
        continue;
      }
    }

    // Track reserved operators in parallel: 0 = dead, 1 = still matching,
    // 2 = exact full match so far.
    for (int op_idx = 0; op_idx < RESERVED_OP_COUNT; op_idx++) {
      if (!reserved_operators[op_idx]) {
        continue;
      }

      if (RESERVED_OPS[op_idx][str_idx] == '\0') {
        reserved_operators[op_idx] = 0;
        continue;
      }

      if (RESERVED_OPS[op_idx][str_idx] != lexer->lookahead) {
        reserved_operators[op_idx] = 0;
        continue;
      }

      if (RESERVED_OPS[op_idx][str_idx + 1] == '\0') {
        reserved_operators[op_idx] = 2;
        continue;
      }
    }

    possible_custom_operator =
        possible_custom_operator &&
        is_legal_custom_operator(str_idx, first_char, lexer->lookahead);

    uint32_t encountered_ops = encountered_op_count(possible_operators);
    if (encountered_ops == 0) {
      if (!possible_custom_operator) {
        break;
      } else if (mark_end && full_match == -1) {
        // Only a custom operator remains viable; claim what we've seen.
        lexer->mark_end(lexer);
      }
    }

    last_examined_char = lexer->lookahead;
    lexer->advance(lexer, false);
    str_idx += 1;

    if (encountered_ops == 0 &&
        !is_legal_custom_operator(str_idx, first_char, lexer->lookahead)) {
      break;
    }
  }

  if (full_match != -1) {
    // We have a match -- first see if that match has a symbol that suppresses
    // it. For example, in `try!`, we do not want to emit the `!` as a symbol in
    // our scanner, because we want the parser to have the chance to parse it as
    // an immediate token.
    uint64_t suppressing_symbols = OP_SYMBOL_SUPPRESSOR[full_match];
    if (suppressing_symbols) {
      for (uint64_t suppressor = 0; suppressor < TOKEN_COUNT; suppressor++) {
        // NOTE(review): `1 << suppressor` shifts an int; for suppressor >= 31
        // (TOKEN_COUNT is 33) this is undefined behavior — should likely be
        // `1ULL << suppressor`. Confirm against upstream.
        if (!(suppressing_symbols & 1 << suppressor)) {
          continue;
        }

        // The suppressing symbol is valid in this position, so skip it.
        if (valid_symbols[suppressor]) {
          return false;
        }
      }
    }
    *symbol_result = OP_SYMBOLS[full_match];
    return true;
  }

  // No exact known-operator match; emit a custom operator unless the text is
  // exactly a reserved spelling. A trailing `<` is excluded from the token
  // (unless followed by whitespace) so generics like `foo<T>` still parse.
  if (possible_custom_operator && !any_reserved_ops(reserved_operators)) {
    if ((last_examined_char != '<' || iswspace(lexer->lookahead)) && mark_end) {
      lexer->mark_end(lexer);
    }
    *symbol_result = CUSTOM_OPERATOR;
    return true;
  }

  return false;
}
472
/**
 * Try to consume a (possibly nested) `/* ... *​/` block comment.
 * On success, writes BLOCK_COMMENT into `*symbol_result` (marking the end if
 * `mark_end` is set).
 *
 * NOTE(review): several `return` statements in this function appear to have
 * been elided from this dump (marked inline below) — confirm against the
 * upstream source.
 */
static enum ParseDirective eat_comment(TSLexer *lexer,
                                       const bool *valid_symbols, bool mark_end,
                                       enum TokenType *symbol_result) {
  if (lexer->lookahead != '/') {
    // NOTE(review): elided statement — likely
    // `return CONTINUE_PARSING_NOTHING_FOUND;`.
  }

  advance(lexer);

  if (lexer->lookahead != '*') {
    // NOTE(review): elided statement — likely a CONTINUE_PARSING return
    // reporting that the `/` was consumed.
  }

  advance(lexer);

  // Walk the comment body, tracking `*` so `*​/` can be recognized, and
  // counting nesting depth (Swift block comments nest).
  bool after_star = false;
  unsigned nesting_depth = 1;
  for (;;) {
    switch (lexer->lookahead) {
    case '\0':
      // NOTE(review): elided statement — likely
      // `return STOP_PARSING_END_OF_FILE;`.
    case '*':
      advance(lexer);
      after_star = true;
      break;
    case '/':
      if (after_star) {
        // `*​/` closes one nesting level.
        advance(lexer);
        after_star = false;
        nesting_depth--;
        if (nesting_depth == 0) {
          if (mark_end) {
            lexer->mark_end(lexer);
          }
          *symbol_result = BLOCK_COMMENT;
          // NOTE(review): elided statement — likely
          // `return STOP_PARSING_TOKEN_FOUND;`.
        }
      } else {
        advance(lexer);
        after_star = false;
        // `/*` opens a nested comment.
        if (lexer->lookahead == '*') {
          nesting_depth++;
          advance(lexer);
        }
      }
      break;
    default:
      advance(lexer);
      after_star = false;
      break;
    }
  }
}
526
/**
 * Consume whitespace (and `;`), deciding whether it amounts to an implicit or
 * explicit semicolon, and whether a following comment/operator should
 * suppress that semicolon.
 *
 * NOTE(review): several statements in this function (the ws_directive
 * initialization and multiple `return`s) appear to have been elided from this
 * dump — marked inline below; confirm against the upstream source.
 */
static enum ParseDirective eat_whitespace(TSLexer *lexer,
                                          const bool *valid_symbols,
                                          enum TokenType *symbol_result) {
  // NOTE(review): elided — likely
  // `enum ParseDirective ws_directive = CONTINUE_PARSING_NOTHING_FOUND;`.
  bool semi_is_valid =
      valid_symbols[IMPLICIT_SEMI] && valid_symbols[EXPLICIT_SEMI];
  uint32_t lookahead;
  while (should_treat_as_wspace(lookahead = lexer->lookahead)) {
    if (lookahead == ';') {
      // Explicit semicolon: consume it and stop.
      if (semi_is_valid) {
        ws_directive = STOP_PARSING_TOKEN_FOUND;
        lexer->advance(lexer, false);
      }

      break;
    }

    lexer->advance(lexer, true);

    lexer->mark_end(lexer);

    // The first newline promotes plain whitespace to a potential implicit
    // semicolon.
    if (ws_directive == CONTINUE_PARSING_NOTHING_FOUND &&
        (lookahead == '\n' || lookahead == '\r')) {
      ws_directive = CONTINUE_PARSING_TOKEN_FOUND;
    }
  }

  // NOTE(review): elided — likely the declaration/initialization of
  // `any_comment` (enum ParseDirective).
  if (ws_directive == CONTINUE_PARSING_TOKEN_FOUND && lookahead == '/') {
    bool has_seen_single_comment = false;
    while (lexer->lookahead == '/') {
      // It's possible that this is a comment - start an exploratory mission to
      // find out, and if it is, look for what comes after it. We care about
      // what comes after it for the purpose of suppressing the newline.

      enum TokenType multiline_comment_result;
      any_comment = eat_comment(lexer, valid_symbols, /* mark_end */ false,
                                &multiline_comment_result);
      if (any_comment == STOP_PARSING_TOKEN_FOUND) {
        // This is a multiline comment. This scanner should be parsing those, so
        // we might want to bail out and emit it instead. However, we only want
        // to do that if we haven't advanced through a _single_ line comment on
        // the way - otherwise that will get lumped into this.
        if (!has_seen_single_comment) {
          lexer->mark_end(lexer);
          *symbol_result = multiline_comment_result;
          // NOTE(review): elided — likely
          // `return STOP_PARSING_TOKEN_FOUND;`.
        }
      } else if (any_comment == STOP_PARSING_END_OF_FILE) {
        // NOTE(review): elided — likely a return propagating EOF.
      } else if (any_comment == CONTINUE_PARSING_SLASH_CONSUMED) {
        // We accidentally ate a slash -- we should actually bail out, say we
        // saw nothing, and let the next pass take it from after the newline.
        // NOTE(review): elided — likely
        // `return CONTINUE_PARSING_SLASH_CONSUMED;`.
      } else if (lexer->lookahead == '/') {
        // There wasn't a multiline comment, which we know means that the
        // comment parser ate its `/` and then bailed out. If it had seen
        // anything comment-like after that first `/` it would have continued
        // going and eventually had a well-formed comment or an EOF. Thus, if
        // we're currently looking at a `/`, it's the second one of those and it
        // means we have a single-line comment.
        has_seen_single_comment = true;
        while (lexer->lookahead != '\n' && lexer->lookahead != '\0') {
          lexer->advance(lexer, true);
        }
      } else if (iswspace(lexer->lookahead)) {
        // We didn't see any type of comment - in fact, we saw an operator that
        // we don't normally treat as an operator. Still, this is a reason to
        // stop parsing.
        // NOTE(review): elided statement here (likely a break or return).
      }

      // If we skipped through some comment, we're at whitespace now, so
      // advance.
      while (iswspace(lexer->lookahead)) {
        any_comment = CONTINUE_PARSING_NOTHING_FOUND; // We're advancing, so
                                                      // clear out the comment
        lexer->advance(lexer, true);
      }
    }

    enum TokenType operator_result;
    bool saw_operator =
        eat_operators(lexer, valid_symbols,
                      /* mark_end */ false, '\0', &operator_result);
    if (saw_operator) {
      // The operator we saw should suppress the newline, so bail out.
      // NOTE(review): elided — likely
      // `return CONTINUE_PARSING_NOTHING_FOUND;`.
    } else {
      // Promote the implicit newline to an explicit one so we don't check for
      // operators again.
      *symbol_result = IMPLICIT_SEMI;
      ws_directive = STOP_PARSING_TOKEN_FOUND;
    }
  }

  // Let's consume operators that can live after a "semicolon" style newline.
  // Before we do that, though, we want to check for a set of characters that we
  // do not consume, but that still suppress the semi.
  if (ws_directive == CONTINUE_PARSING_TOKEN_FOUND) {
    for (int i = 0; i < NON_CONSUMING_CROSS_SEMI_CHAR_COUNT; i++) {
      if (NON_CONSUMING_CROSS_SEMI_CHARS[i] == lookahead) {
        // NOTE(review): elided — likely
        // `return CONTINUE_PARSING_NOTHING_FOUND;`.
      }
    }
  }

  if (semi_is_valid && ws_directive != CONTINUE_PARSING_NOTHING_FOUND) {
    *symbol_result = lookahead == ';' ? EXPLICIT_SEMI : IMPLICIT_SEMI;
    return ws_directive;
  }

  // NOTE(review): elided final statement — likely returning ws_directive or
  // CONTINUE_PARSING_NOTHING_FOUND.
}
641
#define DIRECTIVE_COUNT 4
// Compiler-control directive names matched after a `#`.
// BUG FIX: the array was declared with OPERATOR_COUNT (20), a copy-paste slip
// that left 16 trailing NULL entries; size it by DIRECTIVE_COUNT to match the
// initializer and the loops that iterate it.
const char *DIRECTIVES[DIRECTIVE_COUNT] = {"if", "elseif", "else", "endif"};
644
647
648static enum TokenType find_possible_compiler_directive(TSLexer *lexer) {
649 bool possible_directives[DIRECTIVE_COUNT];
650 for (int dir_idx = 0; dir_idx < DIRECTIVE_COUNT; dir_idx++) {
651 possible_directives[dir_idx] = true;
652 }
653
654 int32_t str_idx = 0;
655 int32_t full_match = -1;
656 while (true) {
657 for (int dir_idx = 0; dir_idx < DIRECTIVE_COUNT; dir_idx++) {
658 if (!possible_directives[dir_idx]) {
659 continue;
660 }
661
662 uint8_t expected_char = DIRECTIVES[dir_idx][str_idx];
663 if (expected_char == '\0') {
664 full_match = dir_idx;
665 lexer->mark_end(lexer);
666 }
667
668 if (expected_char != lexer->lookahead) {
669 possible_directives[dir_idx] = false;
670 continue;
671 }
672 }
673
674 uint8_t match_count = 0;
675 for (int dir_idx = 0; dir_idx < DIRECTIVE_COUNT; dir_idx += 1) {
676 if (possible_directives[dir_idx]) {
677 match_count += 1;
678 }
679 }
680
681 if (match_count == 0) {
682 break;
683 }
684
685 lexer->advance(lexer, false);
686 str_idx += 1;
687 }
688
689 if (full_match == -1) {
690 // No compiler directive found, so just match the starting symbol
691 return HASH_SYMBOL;
692 }
693
694 return DIRECTIVE_SYMBOLS[full_match];
695}
696
/**
 * Try to consume part of a raw string literal (`#"..."#` with any number of
 * hashes). Also responsible for recognizing `#`-introduced compiler
 * directives when exactly one hash is seen without a following quote.
 *
 * state->ongoing_raw_str_hash_count carries the hash count across
 * interpolations so continuation parts match the same delimiter.
 */
static bool eat_raw_str_part(struct ScannerState *state, TSLexer *lexer,
                             const bool *valid_symbols,
                             enum TokenType *symbol_result) {
  uint32_t hash_count = state->ongoing_raw_str_hash_count;
  if (!valid_symbols[RAW_STR_PART]) {
    return false;
  } else if (hash_count == 0) {
    // If this is a raw_str_part, it's the first one - look for hashes
    while (lexer->lookahead == '#') {
      hash_count += 1;
      advance(lexer);
    }

    if (hash_count == 0) {
      return false;
    }

    if (lexer->lookahead == '"') {
      // `#...#"` opens a raw string with `hash_count` hashes.
      advance(lexer);
    } else if (hash_count == 1) {
      // A lone `#` not followed by `"` may introduce a compiler directive.
      lexer->mark_end(lexer);
      *symbol_result = find_possible_compiler_directive(lexer);
      return true;
    } else {
      return false;
    }

  } else if (valid_symbols[RAW_STR_CONTINUING_INDICATOR]) {
    // This is the end of an interpolation - now it's another raw_str_part. This
    // is a synthetic marker to tell us that the grammar just consumed a `(`
    // symbol to close a raw interpolation (since we don't want to fire on every
    // `(` in existence). We don't have anything to do except continue.
  } else {
    return false;
  }

  // We're in a state where anything other than `hash_count` hash symbols in a
  // row should be eaten and is part of a string. The last character _before_
  // the hashes will tell us what happens next. Matters are also complicated by
  // the fact that we don't want to consume every character we visit; if we see
  // a `\#(`, for instance, with the appropriate number of hash symbols, we want
  // to end our parsing _before_ that sequence. This allows highlighting tools
  // to treat that as a separate token.
  while (lexer->lookahead != '\0') {
    uint8_t last_char = '\0';
    lexer->mark_end(
        lexer); // We always want to parse thru the start of the string so far
    // Advance through anything that isn't a hash symbol, because we want to
    // count those.
    while (lexer->lookahead != '#' && lexer->lookahead != '\0') {
      last_char = lexer->lookahead;
      advance(lexer);
      if (last_char != '\\' || lexer->lookahead == '\\') {
        // Mark a new end, but only if we didn't just advance past a `\` symbol,
        // since we don't want to consume that. Exception: if this is a `\` that
        // happens _right after_ another `\`, we for some reason _do_ want to
        // consume that, because apparently that is parsed as a literal `\`
        // followed by something escaped.
        lexer->mark_end(lexer);
      }
    }

    // We hit at least one hash - count them and see if they match.
    uint32_t current_hash_count = 0;
    while (lexer->lookahead == '#' && current_hash_count < hash_count) {
      current_hash_count += 1;
      advance(lexer);
    }

    // If we saw exactly the right number of hashes, one of three things is
    // true:
    // 1. We're trying to interpolate into this string.
    // 2. The string just ended.
    // 3. This was just some hash characters doing nothing important.
    if (current_hash_count == hash_count) {
      if (last_char == '\\' && lexer->lookahead == '(') {
        // Interpolation case! Don't consume those chars; they get saved for
        // grammar.js.
        *symbol_result = RAW_STR_PART;
        state->ongoing_raw_str_hash_count = hash_count;
        return true;
      } else if (last_char == '"') {
        // The string is finished! Mark the end here, on the very last hash
        // symbol.
        lexer->mark_end(lexer);
        *symbol_result = RAW_STR_END_PART;
        // NOTE(review): a statement appears elided here in this dump — likely
        // `state->ongoing_raw_str_hash_count = 0;` to close the raw string.
        return true;
      }
      // Nothing special happened - let the string continue.
    }
  }

  return false;
}
792
793bool tree_sitter_swift_external_scanner_scan(void *payload, TSLexer *lexer,
794 const bool *valid_symbols) {
795 // Figure out our scanner state
796 struct ScannerState *state = (struct ScannerState *)payload;
797
798 // Consume any whitespace at the start.
799 enum TokenType ws_result;
800 enum ParseDirective ws_directive =
801 eat_whitespace(lexer, valid_symbols, &ws_result);
802 if (ws_directive == STOP_PARSING_TOKEN_FOUND) {
803 lexer->result_symbol = ws_result;
804 return true;
805 }
806
807 if (ws_directive == STOP_PARSING_NOTHING_FOUND ||
808 ws_directive == STOP_PARSING_END_OF_FILE) {
809 return false;
810 }
811
812 bool has_ws_result = (ws_directive == CONTINUE_PARSING_TOKEN_FOUND);
813
814 // Now consume comments (before custom operators so that those aren't treated
815 // as comments)
816 enum TokenType comment_result;
817 enum ParseDirective comment =
818 ws_directive == CONTINUE_PARSING_SLASH_CONSUMED
819 ? ws_directive
820 : eat_comment(lexer, valid_symbols, /* mark_end */ true,
821 &comment_result);
822 if (comment == STOP_PARSING_TOKEN_FOUND) {
823 lexer->mark_end(lexer);
824 lexer->result_symbol = comment_result;
825 return true;
826 }
827
828 if (comment == STOP_PARSING_END_OF_FILE) {
829 return false;
830 }
831 // Now consume any operators that might cause our whitespace to be suppressed.
832 enum TokenType operator_result;
833 bool saw_operator =
834 eat_operators(lexer, valid_symbols,
835 /* mark_end */ !has_ws_result,
836 comment == CONTINUE_PARSING_SLASH_CONSUMED ? '/' : '\0',
837 &operator_result);
838
839 if (saw_operator &&
840 (!has_ws_result || is_cross_semi_token(operator_result))) {
841 lexer->result_symbol = operator_result;
842 if (has_ws_result)
843 lexer->mark_end(lexer);
844 return true;
845 }
846
847 if (has_ws_result) {
848 // Don't `mark_end`, since we may have advanced through some operators.
849 lexer->result_symbol = ws_result;
850 return true;
851 }
852
853 // NOTE: this will consume any `#` characters it sees, even if it does not
854 // find a result. Keep it at the end so that it doesn't interfere with special
855 // literals or selectors!
856 enum TokenType raw_str_result;
857 bool saw_raw_str_part =
858 eat_raw_str_part(state, lexer, valid_symbols, &raw_str_result);
859 if (saw_raw_str_part) {
860 lexer->result_symbol = raw_str_result;
861 return true;
862 }
863
864 return false;
865}
void tree_sitter_swift_external_scanner_destroy(void *payload)
void tree_sitter_swift_external_scanner_deserialize(void *payload, const char *buffer, unsigned length)
static enum ParseDirective eat_comment(TSLexer *lexer, const bool *valid_symbols, bool mark_end, enum TokenType *symbol_result)
static int32_t encountered_op_count(bool *encountered_operator)
void * tree_sitter_swift_external_scanner_create()
static void advance(TSLexer *lexer)
void tree_sitter_swift_external_scanner_reset(void *payload)
#define NON_CONSUMING_CROSS_SEMI_CHAR_COUNT
static enum ParseDirective eat_whitespace(TSLexer *lexer, const bool *valid_symbols, enum TokenType *symbol_result)
const uint32_t NON_CONSUMING_CROSS_SEMI_CHARS[NON_CONSUMING_CROSS_SEMI_CHAR_COUNT]
const char * RESERVED_OPS[RESERVED_OP_COUNT]
ParseDirective
All possible results of having performed some sort of parsing.
const uint64_t OP_SYMBOL_SUPPRESSOR[OPERATOR_COUNT]
static bool is_legal_custom_operator(int32_t char_idx, int32_t first_char, int32_t cur_char)
bool tree_sitter_swift_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols)
static bool eat_raw_str_part(struct ScannerState *state, TSLexer *lexer, const bool *valid_symbols, enum TokenType *symbol_result)
unsigned tree_sitter_swift_external_scanner_serialize(void *payload, char *buffer)
static bool should_treat_as_wspace(int32_t character)
static enum TokenType find_possible_compiler_directive(TSLexer *lexer)
static bool is_cross_semi_token(enum TokenType op)
const char * OPERATORS[OPERATOR_COUNT]
enum TokenType OP_SYMBOLS[OPERATOR_COUNT]
const char * DIRECTIVES[OPERATOR_COUNT]
static bool any_reserved_ops(uint8_t *encountered_reserved_ops)
enum IllegalTerminatorGroup OP_ILLEGAL_TERMINATORS[OPERATOR_COUNT]
static bool eat_operators(TSLexer *lexer, const bool *valid_symbols, bool mark_end, const int32_t prior_char, enum TokenType *symbol_result)
#define RESERVED_OP_COUNT
enum TokenType DIRECTIVE_SYMBOLS[DIRECTIVE_COUNT]