// perl2xpr.cpp 6.5 KB
///////////////////////////////////////////////////////////////////////////////
// perl2xpr.cpp
//      A utility for translating a Perl regular expression into an
//      xpressive static regular expression.
//
//  Copyright 2007 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <stack>
#include <string>
#include <iostream>
#include <boost/xpressive/xpressive_static.hpp>
#include <boost/xpressive/regex_actions.hpp>

namespace x = boost::xpressive;
using namespace x;

///////////////////////////////////////////////////////////////////////////////
// main
//      Consumes any leading "-" option arguments, then matches the next
//      argument against a grammar for Perl regular expressions. The grammar's
//      semantic actions assemble the text of an xpressive static regex, which
//      is written to std::cout. Arguments after the regex are not looked at.
//
//      Options (several letters may share one argument, e.g. "-is"):
//          -i   wrap the whole result in "icase( ... )"
//          -s   translate '.' to " _ " instead of " ~_n "
//          -m   translate '^' / '$' to " bol " / " eol " instead of
//               " bos " / " eos "
//
//      Returns 0 on success, and -1 (after a message on std::cerr) for an
//      unknown option, a missing regex argument, or a regex that the grammar
//      does not match in full.
int main(int argc, char *argv[])
{
    // i: index of the argument being examined.
    // j: index of the option letter within argv[i]; starts at 1 to skip '-'.
    int i = 1, j = 1;
    bool nocase = false;
    // Text emitted for '.', '^' and '$'; the -s and -m options replace these.
    char const *dot = " ~_n ";
    char const *bos = " bos ";
    char const *eos = " eos ";

    // Walk every argument that begins with '-'. The increment expression
    // advances j; when that lands on the terminating NUL it resets j to 1 and
    // moves on to the next argument instead.
    for(; i < argc && '-' == *argv[i]; argv[i][++j]? 0: (j=1,++i))
    {
        switch(argv[i][j])
        {
        case 'i':           // perl /i modifier
            nocase = true;
            break;
        case 's':           // perl /s modifier
            dot = " _ ";
            break;
        case 'm':           // perl /m modifier
            bos = " bol ";
            eos = " eol ";
            break;
        default:
            // Also reached for a bare "-", where argv[i][1] is the NUL.
            std::cerr << "Unknown option : " << argv[i] << std::endl;
            return -1;
        }
    }

    // Every argument was an option (or there were none): no regex was given.
    if(i == argc)
    {
        std::cerr << "Usage:\n    perl2xpr [-i] [-s] [-m] 're'\n";
        return -1;
    }

    // Local variables used by the semantic actions below.
    //   mark_nbr  pre-incremented for each capturing group to form its "sN"
    //             name (presumably starts at 0 -- confirm against local<>).
    //   tmp       scratch string: holds the " repeat<...>" prefix while it is
    //             being built, and carries a finished piece across the pop in
    //             the quant rule.
    //   strings   stack of partially built output strings; the quant rule
    //             pushes a fresh one per atom and folds it back into the one
    //             beneath once any quantifier has been applied.
    local<int> mark_nbr;
    local<std::string> tmp;
    local<std::stack<std::string> > strings;

    // The rules of the Perl regex grammar. They are declared up front so they
    // can be defined in terms of each other; group refers back to regex via
    // x::ref(), which is what makes the grammar recursive.
    //
    // In the bracketed actions, "_" stands for the text matched by the
    // sub-expression the action is attached to, while in pattern position "_"
    // matches any single character.
    // NOTE(review): the output relies on xpressive running the queued actions
    // in match order only for the successful match -- confirm against the
    // xpressive semantic-action documentation.
    cregex regex, alts, seq, quant, repeat, atom, escape, group, lit, charset, setelem;

    // lit: one character that is not a regex metacharacter.
    lit     = ~(set='.','^','$','*','+','?','(',')','{','}','[',']','\\','|')
            ;

    // escape: the character following a backslash (the backslash itself is
    // consumed by the atom rule). Known class/boundary escapes map to their
    // xpressive names, a digit becomes " s<digit> ", and anything else is
    // emitted as a single-character as_xpr('c').
    // NOTE(review): the fallback splices the character between single quotes
    // without escaping it, so "\'" would yield as_xpr(''') -- confirm whether
    // that input needs handling.
    escape  = as_xpr('b')               [top(strings) += " _b "]
            | as_xpr('B')               [top(strings) += " ~_b "]
            | as_xpr('d')               [top(strings) += " _d "]
            | as_xpr('D')               [top(strings) += " ~_d "]
            | as_xpr('s')               [top(strings) += " _s "]
            | as_xpr('S')               [top(strings) += " ~_s "]
            | as_xpr('w')               [top(strings) += " _w "]
            | as_xpr('W')               [top(strings) += " ~_w "]
            | _d                        [top(strings) += " s" + _ + " "]
            | _                         [top(strings) += " as_xpr('" + _ + "') "]
            ;

    // group: everything after an opening '(' (consumed by the atom rule).
    // A recognised "?..." prefix selects the wrapper to open; with no such
    // prefix the nil alternative opens a numbered capture " ( sN= ". The body
    // is a full nested regex, and the closing ')' emits " ) ".
    group   = (
                  as_xpr("?:")          [top(strings) += " ( "]
                | as_xpr("?i:")         [top(strings) += " icase( "]
                | as_xpr("?>")          [top(strings) += " keep( "]
                | as_xpr("?=")          [top(strings) += " before( "]
                | as_xpr("?!")          [top(strings) += " ~before( "]
                | as_xpr("?<=")         [top(strings) += " after( "]
                | as_xpr("?<!")         [top(strings) += " ~after( "]
                | nil                   [top(strings) += " ( s" + as<std::string>(++mark_nbr) + "= "]
              )
            >> x::ref(regex)
            >> as_xpr(')')              [top(strings) += " ) "]
            ;

    // setelem: a bracket-set element that is more than a plain character:
    //   \c        -> as_xpr('c')
    //   [:name:]  -> name, preceded by "~" when written as [:^name:]
    //   a-b       -> range('a','b')
    setelem = as_xpr('\\') >> _         [top(strings) += " as_xpr('" + _ + "') "]
            | "[:" >> !as_xpr('^')      [top(strings) += "~"]
                >> (+_w)                [top(strings) += _ ]
                >> ":]"
            | (
                   (s1=~as_xpr(']')) 
                >> '-'
                >> (s2=~as_xpr(']'))
              )                         [top(strings) += "range('" + s1 + "','" + s2 + "')"]
            ;

    // charset: everything after an opening '[' (consumed by the atom rule).
    // An optional leading '^' emits " ~ ", then " set[ " is opened. At least
    // one element is required; each further element is preceded by " | ".
    // The first plain character is wrapped in as_xpr(...) while later ones
    // are emitted as bare character literals -- presumably so the generated
    // '|' chain starts from an xpressive object rather than a plain char
    // (TODO confirm).
    charset = !as_xpr('^')              [top(strings) += " ~ "]
            >> nil                      [top(strings) += " set[ "]
            >> (
                    setelem
                  | (~as_xpr(']'))      [top(strings) += " as_xpr('" + _ + "') "]
               )
            >>*(
                    nil                 [top(strings) += " | "]
                 >> (
                        setelem
                      | (~as_xpr(']'))  [top(strings) += "'" + _ + "'"]
                    )
               )
            >> as_xpr(']')              [top(strings) += " ] "]
            ;

    // atom: the smallest unit a quantifier can apply to.
    // The first alternative takes a run of literals, each of which must not
    // be followed by a quantifier character; the literal directly before a
    // quantifier is therefore left for the single-lit alternative, so the
    // quantifier ends up applying to that one character only.
    // NOTE(review): the matched text is placed between double quotes as-is;
    // a '"' in the input would produce output that is not valid C++.
    atom    = (
                  +(lit >> ~before((set='*','+','?','{')))
                | lit
              )                         [top(strings) += " as_xpr(\"" + _ + "\") "]
            | as_xpr('.')               [top(strings) += dot]
            | as_xpr('^')               [top(strings) += bos]
            | as_xpr('$')               [top(strings) += eos]
            | '\\' >> escape
            | '(' >> group
            | '[' >> charset
            ;

    // repeat: "{n}", "{n,}" or "{n,m}". The prefix " repeat<n", optionally
    // followed by ",m" or ",inf", is accumulated in tmp; at the closing '}'
    // the current top string is rewritten as  tmp + ">( " + top + " ) ".
    repeat  = as_xpr('{')               [tmp = " repeat<"]
            >> (+_d)                    [tmp += _]
            >> !(
                    as_xpr(',')         [tmp += ","]
                 >> (
                        (+_d)           [tmp += _]
                      | nil             [tmp += "inf"]
                    )
                )
            >> as_xpr('}')              [top(strings) = tmp + ">( " + top(strings) + " ) "]
            ;

    // quant: an atom with an optional quantifier.
    // A fresh string is pushed so the atom's text is isolated; the quantifier
    // is then applied to it as a prefix (" * ", " + ", " ! ") or as a
    // repeat<> wrapper, and a trailing '?' adds a further " - " prefix.
    // Finally the finished piece is popped and appended to the string below.
    quant   = nil                       [push(strings, "")]
            >> atom
            >> !(
                    (
                        as_xpr("*")     [insert(top(strings), 0, " * ")] // [strings->*top()->*insert(0, " * ")]
                      | as_xpr("+")     [insert(top(strings), 0, " + ")] // [strings->*top()->*insert(0, " + ")]
                      | as_xpr("?")     [insert(top(strings), 0, " ! ")] // [strings->*top()->*insert(0, " ! ")]
                      | repeat
                    )
                 >> !as_xpr('?')        [insert(top(strings), 0, " - ")]
                )
            >> nil                      [tmp = top(strings), pop(strings), top(strings) += tmp]
            ;

    // seq: one or more quantified atoms, joined in the output by " >> ".
    seq     = quant
            >> *(
                    nil                 [top(strings) += " >> "]
                 >> quant
                )
            ;

    // alts: one or more sequences separated by '|', joined by " | ".
    alts    = seq
            >> *(
                    as_xpr('|')         [top(strings) += " | "]
                 >> seq
                )
            ;

    // regex: top-level rule; also the target of the recursion in group.
    regex   = alts
            ;

    // Seed the stack with the string that collects the final result, so the
    // outermost quant has an entry beneath its own to append into.
    strings.get().push("");
    // regex_match succeeds only if the whole argument is consumed.
    if(!regex_match(argv[i], regex))
    {
        std::cerr << "ERROR: unrecognized regular expression" << std::endl;
        return -1;
    }
    else if(nocase)
    {
        // -i was given: make the entire translated expression case-insensitive.
        std::cout << "icase( " << strings.get().top() << " )" << std::endl;
    }
    else
    {
        std::cout << strings.get().top() << std::endl;
    }

    return 0;
}