Syntax Analyse in C++

Martin Sulzmann

Syntax Analyse

Darstellung von Syntax

Konkrete versus abstrakte Syntax.

Darstellung in C++?

Parsing

Allgemeiner Parser. Siehe Cocke–Younger–Kasami algorithm.

Kubische Komplexität.

Wir verwenden Top-Down Parsing.

Deterministisches parsen in linearer Zeit.

Am Beispiel von reguläre Ausdrücken.

Weitere Beispiele und Details siehe hier.

Reguläre Ausdrücke

R ::= x           // Alphabet symbol
    | R + R       // Alternative
    | R . R       // Konkatenation
    | R*          // Kleensche Hülle
    | (R)         // Klammerung

beschrieben durch eine kontext-freie Grammatik in EBNF.

Beispiele

x*

(x*) . (y*)

(x + y) . z

Beachte:

Darstellung in C++

Wir unterscheiden zwischen konkreter und abstrakter Syntax Darstellung.

Wir verwenden

Unten-stehender Programmcode verwendet

// Polymorphie am Beispiel  regulärer Ausdrücke.
//
// R ::= x | R + R | R . R | R* | (R)
//
// 1. Templates zwecks Emulation von parametrischer Polymorphie (aka "generics")
// 2. Overloading
// 3. Subtyping



#include <iostream>
#include <string>
#include <memory>
using namespace std;


class RE_ {
public:
  virtual string show() = 0;     // Quasi eine pretty-print Methode.
  virtual bool nullable() = 0;   // Ueberprueft, ob der regulaere Ausdruck,
                                 // das leere Wort enthaelt.

};

// Zwecks virtueller Methodenauswahl, müssen Objekte via Referenz erreichbar sein.
// Wir verwenden "smart pointers".
typedef shared_ptr<RE_> RE;                            // TEMPLATE


// Epsilon, the empty word.
class Eps : public RE_ {
public:
  string show() { return "eps"; }
  bool nullable() { return true; }
};

// Alphabet symbols = letters.
class Letter : public RE_ {
  char x;
public:
  Letter(char y) { x = y; }
  string show() { return string(1,x); }
  bool nullable() { return false; }
};

// Alternatives.
class Alt : public RE_ {
  RE left;
  RE right;
public:
  Alt(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());                          // OVERLOADING
    s.append("+");
    s.append(right->show());
    s.append(")");
    return s;
  }
  bool nullable() { return (left->nullable() || right->nullable()); }
};


// Concatenation.
class Conc : public RE_ {
  RE left;
  RE right;
public:
  Conc(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());
    s.append(".");
    s.append(right->show());
    s.append(")");
    return s;
  }
  bool nullable() { return (left->nullable() && right->nullable()); }
};

// Kleene star.
class Star : public RE_ {
  RE r;
public:
  Star(RE s) { r = s; }
  string show() {
    string s("(");
    s.append(r->show());
    s.append("*)");
    return s;
  }
  bool nullable() { return true; }
};

// Helpers

// Point to note.
// Co-variant subtyping here!
// shared_ptr<Eps> <= shared_part<RE_>.
RE mkEps() { return make_shared<Eps>(Eps()); }                     // SUBTYPING

RE mkLetter(char x) { return make_shared<Letter>(Letter(x)); }

RE mkAlt(RE l, RE r) { return make_shared<Alt>(Alt(l,r)); }

RE mkConc(RE l, RE r) { return make_shared<Conc>(Conc(l,r)); }

RE mkStar(RE r) { return make_shared<Star>(Star(r)); }


// Beispiele

void test() {

  // (x + eps) . (y*)
  RE r = mkConc(mkAlt(mkLetter('x'), mkEps()), mkStar(mkLetter('y')));

  cout << "\n" << r->show();

  cout << "\n" << r->nullable();

}



int main() {

  test();

}


/*

Beachte.

TEMPLATE (siehe oben).

     shared_ptr sind "generisch" in dem zu verwaltenden Datentyp.

OVERLOADING (siehe oben).

      left->show() entspricht   (*left).show()

      left ist vom Typ RE, RE ist gleich shared_ptr<RE_>

      Der Dereferenzierungsoperator "*" ist überladen.

SUBTYPING (siehe oben).

      make_shared<Eps>(Eps())  ist vom Typ   shared_ptr<Eps>

      Rückgabetyp ist shared_ptr<RE_>

      Beachte,  shared_ptr<RE_> ist verschieden von shared_ptr<Eps>!
      Wieso ist der Programmtext Typkorrekt!?

     Es gilt hier Ko-variantes Subtyping.

      1. Subtyping zwischen Basistype

      Aus der Klassendeklaration leiten wir ab  Eps <= RE_
      D.h ein Objekt der Klasse Eps ist auch
      vom Typ RE_


     2. Subtyping zwischen Typen(konstruktoren) mit Typargument, z.B. shared_ptr<....>

       shared_ptr<Eps> <= shared_ptr<RE_>  weil Eps <= RE_

       D.h. Subtyping Relation zwischen shared_ptr Instanzen richtet sich nach
       der Subtyping Relation zwischen den zugrunde liegenden Basistypen.

       Die Richtung von "<=" bleibt erhalten. Die Subtype Relation verhaelt sich hier ko-variant.

       Gibt es auch kontra-variant?
       Ja, betrachte Coercive Subtyping Beispiel.

int func(float x) {
    if (x < 1.0) return 1;
    return 0;
}


In mathematischer Schreibweise, func hat den Typ float -> int.
Es gilt aber auch

  float -> int  <=   int -> float

 weil int <= float

  In diesem Fall verhaelt sich Subtyping Kontra-variant im Argument und Ko-variant im Resultat.



 */

Pretty printing

Im Fall von + eliminiere Klammern.

Idee:

// --std=c++17
// Beispiel  regulärer Ausdrücke.
//
// R ::= x | R + R | R . R | R* | (R)
//
// Pretty printing



#include <iostream>
#include <string>
#include <memory>
#include <vector>
#include <optional>
using namespace std;


// Forward Referenzen + typedefs
class RE_;
typedef shared_ptr<RE_> RE;
RE mkEps();
RE mkLetter(char);
RE mkConc(RE,RE);
RE mkStar(RE);
RE mkAlts(vector<RE> rs);


class RE_ {
public:
  virtual string show() = 0;     // Quasi eine pretty-print Methode.

  virtual bool isAlts() { return false; }

  virtual RE convert() = 0;

  virtual optional<vector<RE>> getOperands() { return nullopt; }
};


// Epsilon, the empty word.
class Eps : public RE_ {
public:
  string show() { return "eps"; }
  RE convert() {
    return mkEps();
  }
};

// Alphabet symbols = letters.
class Letter : public RE_ {
  char x;
public:
  Letter(char y) { x = y; }
  string show() { return string(1,x); }
  RE convert() {
    return mkLetter(x);
  }
};

// Alternatives.
class Alt : public RE_ {
  RE left;
  RE right;
public:
  Alt(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());
    s.append("+");
    s.append(right->show());
    s.append(")");
    return s;
  }

  RE convert() {
    auto r1 = left->convert();
    auto r2 = right->convert();

    if(r1->isAlts() && r2->isAlts()) {
      auto rs1 = (r1->getOperands()).value();
      auto rs2 = (r2->getOperands()).value();

      rs1.insert(rs1.end(), rs2.begin(), rs2.end());

      return mkAlts(rs1);
    } else if(r1->isAlts()) {

      auto rs1 = (r1->getOperands()).value();

      rs1.push_back(r2);

      return mkAlts(rs1);

    } else if(r2->isAlts()) {

      auto rs2 = (r2->getOperands()).value();

      rs2.insert(rs2.begin(),r1);

      return mkAlts(rs2);
    }
    // else
    vector<RE> ts;

    ts.push_back(r1);
    ts.push_back(r2);

    return mkAlts(ts);
  }
};


class Alts : public RE_ {
public:
  vector<RE> rs;
  Alts(vector<RE> ts) { rs = ts; }
  string show() {
    string s;
    if(rs.size() == 0)
      return s;

    s.append("(");
    for(int i=0; i < rs.size(); i++) {
      s.append(rs[i]->show());
      if(i < rs.size() - 1) {
    s.append("+");
      }
    }
    s.append(")");
    return s;
  }

  bool isAlts() {
    return true;
  }

  optional<vector<RE>> getOperands() {
    return make_optional<vector<RE>>(rs);
  }

  RE convert() {
    return mkAlts(rs);
  }

};



// Concatenation.
class Conc : public RE_ {
  RE left;
  RE right;
public:
  Conc(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());
    s.append(".");
    s.append(right->show());
    s.append(")");
    return s;
  }
  RE convert() {
    return mkConc(left->convert(),
          right->convert());
  }
};

// Kleene star.
class Star : public RE_ {
  RE r;
public:
  Star(RE s) { r = s; }
  string show() {
    string s("(");
    s.append(r->show());
    s.append("*)");
    return s;
  }
  RE convert() {
    return mkStar(r->convert());
  }
};

// Helpers

RE mkEps() { return make_shared<Eps>(Eps()); }

RE mkLetter(char x) { return make_shared<Letter>(Letter(x)); }

RE mkAlt(RE l, RE r) { return make_shared<Alt>(Alt(l,r)); }

RE mkConc(RE l, RE r) { return make_shared<Conc>(Conc(l,r)); }

RE mkStar(RE r) { return make_shared<Star>(Star(r)); }

RE mkAlts(vector<RE> rs) { return make_shared<Alts>(Alts(rs)); }

void run(RE r) {
  cout << "\nTest\n" << r->show();
  cout << "\n" << r->convert()->show();
}

void test() {



  RE r1 = mkAlt(mkLetter('x'), mkLetter('y'));
  RE r2 = mkAlt(r1,r1);

  run(r2);

  RE r3 = mkConc(r1,mkLetter('y'));

  run(r3);

  RE r4 = mkAlt(r3,r2);

  run(r4);
}




int main() {

  test();
}

Top-down parser für reguläre Ausdrücke

// g++ --std=c++11

// Beispiel  regulärer Ausdrücke.
//
// R ::= x | R + R | R . R | R* | (R)
//
//   wobei x ein KLEIN Buchstaben ist.



#include <iostream>
#include <string>
#include <memory>
#include <vector>
using namespace std;


// AST Darstellung.

class RE_ {
public:
  virtual string show() = 0;     // Quasi eine pretty-print Methode.
};

// Zwecks virtueller Methodenauswahl, müssen Objekte via Referenz erreichbar sein.
// Wir verwenden "smart pointers".
typedef shared_ptr<RE_> RE;                            // TEMPLATE


// Epsilon, the empty word.
class Eps : public RE_ {
public:
  string show() { return "eps"; }
};

// Alphabet symbols = letters.
class Letter : public RE_ {
  char x;
public:
  Letter(char y) { x = y; }
  string show() { return string(1,x); }
};

// Alternatives.
class Alt : public RE_ {
  RE left;
  RE right;
public:
  Alt(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());                          // OVERLOADING
    s.append("+");
    s.append(right->show());
    s.append(")");
    return s;
  }
};


// Concatenation.
class Conc : public RE_ {
  RE left;
  RE right;
public:
  Conc(RE l, RE r) { left = l; right = r; }
  string show() {
    string s("(");
    s.append(left->show());
    s.append(".");
    s.append(right->show());
    s.append(")");
    return s;
  }
};

// Kleene star.
class Star : public RE_ {
  RE r;
public:
  Star(RE s) { r = s; }
  string show() {
    string s("(");
    s.append(r->show());
    s.append("*)");
    return s;
  }
};

// Helpers

// Point to note.
// Co-variant subtyping here!
// shared_ptr<Eps> <= shared_part<RE_>.
RE mkEps() { return make_shared<Eps>(Eps()); }                     // SUBTYPING

RE mkLetter(char x) { return make_shared<Letter>(Letter(x)); }

RE mkAlt(RE l, RE r) { return make_shared<Alt>(Alt(l,r)); }

RE mkConc(RE l, RE r) { return make_shared<Conc>(Conc(l,r)); }

RE mkStar(RE r) { return make_shared<Star>(Star(r)); }


/////////////////////////////////////
// Tokenizer

typedef enum {
  EOS,           // End of string
  LETTER,
  OPEN,
  CLOSE,
  CONC,
  ALT,
  KLEENE,
} Token_t;


class Token {
public:
  Token() {}
  Token(Token_t tt) {
    kind = tt;
  }
  Token(Token_t tt, char v) {
    kind = tt; val = v;
  }
  bool eos() {
    return kind == EOS;
  }
  Token_t kind;
  char val;
} ;



class Tokenize {
  string s;
  int pos;
public:
  Tokenize(string s) {
         this->s = s;
     pos = 0;
  }

  Token next() {
    if(s.length() <= pos)
      return Token(EOS);

    while(1) {

       if(s.length() <= pos)
         return Token(EOS);

       switch(s[pos]) {
       case '(': pos++;
     return Token(OPEN);
       case ')': pos++;
     return Token(CLOSE);
       case '+': pos++;
     return Token(ALT);
       case '.': pos++;
     return Token(CONC);
       case '*': pos++;
     return Token(KLEENE);
       case ' ':
     pos++;
     break;
       default:  char ch = s[pos];
             if(ch >= 'a' && ch <= 'z') {
                     pos++;
             return Token(LETTER, ch);
         } else {
           cout << "Unerlaubtes Symbol";
           exit(1);
         }
       }
    }
  } // next

  vector<Token> scan() {
    vector<Token> v;
    Token t;

    do {
      t = next();
      v.push_back(t);
    }
    while(! t.eos());

    return v;
  } // scan
};

// Wrapper class, provide the (current) token.
class Tokenizer : Tokenize {
public:
  Token token;
  Tokenizer(string s) : Tokenize(s) { token = next(); }
  void nextToken() {
    token = next();
  }
};


/////////////////////////////////////
// Top-Down Parser fuer
//
//  R ::= x | R + R | R . R | R* | (R)

//  R ::= T R2
//  R2 ::= . T R2 |
//  T ::= F T2
//  T2 ::= + F T2 |
//  F ::= x | x* | (R) | (R)*

// Beachte.
// "+" bindet staerker als "."
// "Kleene star" wird betrachtet im Fall von "letter" und geklammerten Ausdruecken.
// Deshalb bindet "*" staerker als die anderen Operatoren.


// Newer C++ versions come with "optional".
// We simply build our own and follow the original idea as found in Haskell's Maybe
template<typename T>
class Optional {
  bool b;
  T val;
public:
  Optional() : b(false) {}
  Optional(T v) : val(v), b(true) {}
  bool isJust() { return b; }
  bool isNothing() { return !b; }
  T fromJust() { return val; }
};

template<typename T>
Optional<T> nothing() {
  return Optional<T>();
}

template<typename T>
Optional<T> just(T v) {
  return Optional<T>(v);
}


// Functional style top-down parser.

Optional<RE> parseR(Tokenizer &t);
Optional<RE> parseR2(Tokenizer &t, RE left);
Optional<RE> parseT(Tokenizer &t);
Optional<RE> parseT2(Tokenizer &t, RE left);
Optional<RE> parseF(Tokenizer &t);


//  R ::= T R2
Optional<RE> parseR(Tokenizer &t) {
  Optional<RE> left = parseT(t);
  if(left.isNothing())
    return left;

  return parseR2(t,left.fromJust());
}

//  R2 ::= . T R2 |
Optional<RE> parseR2(Tokenizer &t, RE left) {

    if(t.token.kind == CONC) {
        t.nextToken();

    Optional<RE> right = parseT(t);
    if(right.isNothing())
      return right;

    return parseR2(t, mkConc(left, right.fromJust()));
    }

    return just<RE>(left);
}

//  T ::= F T2
Optional<RE> parseT(Tokenizer &t) {
  Optional<RE> left = parseF(t);
  if(left.isNothing())
    return left;

  return parseT2(t,left.fromJust());
}

// T2 ::= + F T2 |
Optional<RE> parseT2(Tokenizer &t, RE left) {

    if(t.token.kind == ALT) {
        t.nextToken();

    Optional<RE> right = parseF(t);
    if(right.isNothing())
      return right;

    return parseT2(t, mkAlt(left, right.fromJust()));
    }

    return just<RE>(left);
}

//  F ::= x | x* | (R) | (R)*
Optional<RE> parseF(Tokenizer &t) {
  switch(t.token.kind) {
  case LETTER: {
    char ch = t.token.val;
    t.nextToken();

    if(t.token.kind == KLEENE) {
      t.nextToken();
      return just<RE>(mkStar(mkLetter(ch)));
    }
    return just<RE>(mkLetter(ch));
  }
  case OPEN: {
    t.nextToken();
    Optional<RE> re = parseR(t);
    if (re.isNothing())
      return re;

    if (t.token.kind != CLOSE)
      return nothing<RE>();

    t.nextToken();

    if(t.token.kind == KLEENE) {
      t.nextToken();
      re = mkStar(re.fromJust());
    }
    return re;
  }

  default:
    return nothing<RE>();
  } // switch
}



// Beispiele


void display(Optional<RE> r) {
  if(r.isNothing()) {
    cout << "nothing \n";
  } else {
    cout << (r.fromJust())->show() << "\n";
  }
  return;
}


void parse(string s) {
  Tokenizer t = Tokenizer(s);

  auto r = parseR(t);

  // Whole input shall be consumed.
  if (t.token.kind == EOS) {
    display(r);
    }
  else {
    display(nothing<RE>());
  }

}


void test() {

  // (x + eps) . (y*)
  RE r = mkConc(mkAlt(mkLetter('x'), mkEps()), mkStar(mkLetter('y')));

  cout << "\n" << r->show();

}



int main() {

  parse("x.y+z");
  parse("x* + y*");
  parse("x + y *");
  parse("(x+y)*");

  parse("x x");

}