【编译原理】【词法分析】【正则表达式】【NFA】【DFA】【C++】正则表达式转DFA&NFA，判断字符串是否符合正则表达式的匹配算法

这篇具有很好参考价值的文章主要介绍了【编译原理】【词法分析】【正则表达式】【NFA】【DFA】【C++】正则表达式转DFA&NFA，判断字符串是否符合正则表达式的匹配算法。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

正则表达式转DFA&NFA

显然，正则表达式、NFA、DFA的概念都很简单，所以直接上代码，注释应该解释地比较清楚，没有万能头文件的自行替换需求库，如果有疑问的可以留言。

网盘链接
[自行补全]/s/1pbGT_wpB662TwFrnukXgGQ?pwd=TSIT
提取码：TSIT

运行截图

【编译原理】【词法分析】【正则表达式】【NFA】【DFA】【C++】正则表达式转DFA&NFA，判断字符串是否符合正则表达式的匹配算法

原理可以参考这篇博客传送门

本次程序由四个文件组成文章来源地址https://www.toymoban.com/news/detail-506258.html

文件名	描述
Regular_Expression.cpp	处理正则表达式
NFA.cpp	正则表达式转NFA
DFA.cpp	NFA转DFA
MainProcess.cpp	主要处理程序

//Regular_Expression.cpp
#ifndef __REGULAR_EXPRESSION__
#define __REGULAR_EXPRESSION__

#include<bits/stdc++.h>
using namespace std;


const int OPERANDS = 1; // 操作符
const int OPERATOR = 2; // 运算符
const int ILLEGAL_CHAR = -1;  // 非法字符

// 正则表达式包含非法字符
class Regular_Expression_Input_Exception
{   
    public:
    char c;
    Regular_Expression_Input_Exception(char c)
    {
        this->c = c;
    }
    string what()const
    {   
        string ret;
        ret = c + " is not in the charset" ;
        return ret;
    }
};

// 正则表达式格式错误
class Regular_Expression_Format_Exceprion
{
    public:
    Regular_Expression_Format_Exceprion(){}
    string what()const
    {
        string ret;
        ret = "Format Error ";
        return ret;
    }
};

class Regular_Expression
{   
    public:

    Regular_Expression(string & s)
    {
        expression = s;
        expression = Regular_Expression_Pretreatment(expression);
        expression = Regular_Expression_Infix2PostFix(expression);
    }

    string get_postfix()
    {
        return expression;
    }

    private:
    string expression;
    inline string Regular_Expression_Pretreatment(string&);
    inline string Regular_Expression_Infix2PostFix(string&);
};

// 返回字符类型
int get_chartype(char c)
{
    if  (   ('a' <= c && c <= 'z')||
            ('A' <= c && c <= 'Z')||
            ('0' <= c && c <= '9')||
            (c == '.')
        ) return OPERANDS;

    if  (   (c == '|')  ||  
            (c == '*')  ||
            (c == '(')  ||  
            (c == ')')  || 
            (c == '&')
        ) return OPERATOR;
    return ILLEGAL_CHAR;
}

// 设置操作符优先级
int get_priorioty(char c)
{
    switch (c)
    {
        case '*':   return 3;   break;
        case '&':   return 2;   break;
        case '|':   return 1;   break;
        case '(':   return 0;   break;
        default:    return ILLEGAL_CHAR;    break;
    }
}

//检查并预处理正则表达式 添加&作为连接符
string Regular_Expression::Regular_Expression_Pretreatment(string & pre_expression)
{
    
    assert(pre_expression.size() > 0);

    string treated_expression;
    try
    {   
        char prechar ;
        int pretype ;
        char nchar = pre_expression[0];
        int ntype = get_chartype(nchar);

        if(ntype == ILLEGAL_CHAR) throw Regular_Expression_Input_Exception(nchar);

        treated_expression.push_back(nchar);
        prechar = nchar;
        pretype = ntype;

        int len = pre_expression.length();

        for(int i = 1 ; i<len ; ++i)
        {
            nchar = pre_expression[i];
            ntype = get_chartype(nchar);

            if(ntype == ILLEGAL_CHAR) throw Regular_Expression_Input_Exception(nchar);

            // 当第一位是操作数 , * , ）且第二位为操作数或（
            if( (pretype == OPERANDS || prechar == '*' || prechar == ')'  ) && 
                (ntype == OPERANDS || nchar == '(')
                ) 
            // 使用 & 作为连接符
            treated_expression.push_back('&');
            treated_expression.push_back(nchar);

            pretype = ntype;
            prechar = nchar;
        }

        return treated_expression;

    }
    catch(const Regular_Expression_Input_Exception& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }
    catch(const exception& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }

    return treated_expression;
}

// 正则表达式的中缀表达式转成后缀表达式
string Regular_Expression::Regular_Expression_Infix2PostFix(string & pre_expression)
{

    assert(pre_expression.size() > 0);

    string treated_expression;

    try
    {
        vector<char> op;

        int len = pre_expression.length();        

        int ntype ;
        char nchar;

        for(int i=0 ; i<len ; ++i)
        {
            nchar = pre_expression[i];
            ntype = get_chartype(nchar);

            if(ntype == ILLEGAL_CHAR) throw Regular_Expression_Input_Exception(nchar);

            // 遇到 （ 直接压入栈中；
            if(nchar == '(')
            {
                op.push_back(nchar);
            }
            
            // 遇到 ） 将运算符出栈并输出，直到遇到 （ ，将 （ 出栈但不输出
            else if(nchar == ')')
            {
                while(!op.empty() && op.back() != '(')
                {
                    treated_expression.push_back(op.back());
                    op.pop_back();
                }

                // 没有左括号匹配
                if(op.empty()) throw Regular_Expression_Format_Exceprion();

                // 弹出左括号
                op.pop_back();
            }
            /*  遇到 * 、 & 、 | 运算符：
                如果栈为空，直接将运算符压入栈中；
                如果栈不为空，弹出栈中优先级大于等于当前运算符的运算符并输出，再将当前运算符入栈。
            */
            else if(ntype == OPERATOR)
            {   
                int npriority = get_priorioty(nchar);
                while(!op.empty() && get_priorioty(op.back()) >= npriority)
                {
                    treated_expression.push_back(op.back());
                    op.pop_back();
                }
                op.push_back(nchar);
            }

            else if(ntype == OPERANDS)
            {
                treated_expression.push_back(nchar);
            }
        }

        while(!op.empty())
        {
            nchar = op.back();
            
            // 缺少右括号匹配
            if(nchar == '(') throw Regular_Expression_Format_Exceprion();

            treated_expression.push_back(nchar);
            op.pop_back();
        }
        
        return treated_expression;
    }
    
    catch(const Regular_Expression_Input_Exception& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }
    catch(const Regular_Expression_Format_Exceprion& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }
    catch(const std::exception& e)
    {
        std::cerr << e.what() << '\n';
        exit(0);
    }

    return treated_expression;   
}



// #ifndef __MAINPROCESS__
// int main()
// {
//     #ifdef __LOCAL
//     freopen("usrin.txt", "r", stdin);
//     freopen("usrout.txt", "w", stdout);
//     #endif

//     string str;
//     cin >> str;
//     Regular_Expression reg(str);
//     cout << reg.get_postfix() << "\n";
// }
// #endif

#endif

//NFA.cpp
#ifndef __NFA__
#define __NFA__

#include "Regular_Expression.cpp"
#include<bits/stdc++.h>
using namespace std;
struct Nfastate
{   

    // '#' 视为空转移
    Nfastate(char c = '#' , Nfastate* trans = NULL , vector<Nfastate*> vec = vector<Nfastate*>())
    {
        input = c;
        chTrans = trans;
        epTrans.clear();
        for(auto x:vec) epTrans.insert(x);
    }

    void add_New_epTrans(Nfastate& n)
    {
        epTrans.insert(&n);
    }

    void add_New_epTrans(Nfastate* n)
    {
        epTrans.insert(n);
    }

    void add_new_epTrans(vector<Nfastate*> &vec)
    {
        for(auto x:vec) epTrans.insert(x);
    }

    void print();
    set<Nfastate*> closure();
    char input;                 // 转移字符
    Nfastate* chTrans;          // 转移到的状态
    set<Nfastate*> epTrans;     //  空转移状态集合
};

set<Nfastate*> Nfastate::closure()
{
    set<Nfastate*> s;
    
    s.insert(this);

    bool hasnewnode = 1;

    while(hasnewnode)
    {
        hasnewnode = 0;
        for(auto x:epTrans)
        {
            if(s.find(x) == s.end())
            {
                hasnewnode = 1;
                s.insert(x);
            }
        }
    }

    return s;

}

void Nfastate::print()
{
    cerr << "------Nfastate---------\n";
    cerr << "input = " << input << "\n";
    cerr << "chTrans = " << chTrans << "\n";
    cerr << "epTrans = {" ;
    for(auto x:epTrans) cerr << x << " ";
    cerr << "}\n---------------------------------\n";
}

struct subNFA
{
    subNFA(Nfastate* phead = NULL , Nfastate* ptail = NULL)
    {
        head = phead;
        tail = ptail;
    }
    subNFA(Nfastate& ohead , Nfastate & otail)
    {
        head = &ohead;
        tail = &otail;
    }

    subNFA(char c)
    {
        Nfastate *ntail = new Nfastate;
        Nfastate *nhead = new Nfastate(c , ntail);


        // fix1
        if(c == '#') nhead->chTrans = NULL;

        head = nhead;
        tail = ntail;
    }

    void operator=(const subNFA& ele)
    {
        head = ele.head;
        tail = ele.tail;
    }

    void print();
    Nfastate *head;             // 头部指针
    Nfastate *tail;             //  尾部指针
};


void subNFA::print()
{
    int nodecnt = 0;
    map<Nfastate* , int> mp;



    queue<Nfastate*> q;
    q.push(head);

    mp[head] = ++nodecnt;

    auto addnode = [&](Nfastate* e)
    {
        if(mp.count(e)) return mp[e];
        mp[e] = ++nodecnt;
        q.push(e);
        return mp[e];
    };

    while(!q.empty())
    {
        Nfastate* now = q.front();
        q.pop();


        std::cout << "node : " << mp[now] << "\n";
        if(now->chTrans != NULL) 
        {
            addnode(now->chTrans);
            std::cout << "    chTrans : " << now->input << "\n";
            std::cout << "    Transnode : " << mp[now->chTrans] << "\n";
        }
        if(!now->epTrans.empty())
            std:: cout <<"    epTrans : {";

        bool first = 1;
        for(auto x:now->epTrans)
        {
            addnode(x);
            std::cout << (first?"":" , ") << mp[x] ; 
            first = 0;
        }
        if(!now->epTrans.empty())
            std::cout << "}\n";
        
        cout << "\n";
    }

    std::cout << "Start node: "<< mp[head] << "\n";
    std::cout << "End node : " << mp[tail] << "\n";
}

class NFA
{
    public:
    NFA(string &expression):reg(expression),subnfa()
    {
        // cerr << reg.get_postfix() << "\n";
        Build_NFA();
    }   

    void print();
    void Build_NFA();
    Nfastate* gethead();
    Nfastate* gettail();
    private:
    subNFA subnfa;              // 最终的NFA
    Regular_Expression reg;     // 正则表达式转换
                 
};

Nfastate* NFA::gethead(){return subnfa.head;}
Nfastate* NFA::gettail(){return subnfa.tail;}

void NFA::print()
{   
    //debug
    // std::cout << nfa.head << " " << nfa.tail << "\n"; 
    std::cout << reg.get_postfix() << "\n";
    subnfa.print();
}

void NFA::Build_NFA()
{
    string expression = reg.get_postfix();
    vector<subNFA> sta_nfa;     // nfa栈

    try
    {
        int len = expression.length();
        for(int i=0 ; i<len ; ++i)
        {
            char nchar = expression[i];
            int ntype = get_chartype(nchar);

            //如果遇到操作数a，则新建一个NFA，转换弧上的值为a，将这个NFA压入栈中
            if(ntype == OPERANDS)
            {
                subNFA n(nchar);
                sta_nfa.push_back(n);

                //debug
                // cerr << nchar << ' ' << n.head->input << "\n";
                // cerr << n.head << " " << n.tail << "\n";
                // n.head->print();
                // n.tail->print();
            }
            else if(nchar == '*')
            {
                // 取不出一个元素
                if(sta_nfa.size() < 1) throw Regular_Expression_Format_Exceprion();
                subNFA n1 = sta_nfa.back();
                sta_nfa.pop_back();

                subNFA n('#');

                (*n.head).add_New_epTrans(n1.head);
                (*n1.tail).add_New_epTrans(n.tail);
                (*n1.tail).add_New_epTrans(n.head);

                sta_nfa.push_back(n);

            }
            else if(nchar == '|')
            {
                // 取不出两个元素
                if(sta_nfa.size() < 2) throw Regular_Expression_Format_Exceprion();

                subNFA n1 = sta_nfa.back();
                sta_nfa.pop_back();
                subNFA n2 = sta_nfa.back();
                sta_nfa.pop_back();

                subNFA n('#');
                
                //debug
                // n.head->print();
                // n.tail->print();


                (*n.head).add_New_epTrans(n1.head);
                (*n.head).add_New_epTrans(n2.head);
                (*n1.tail).add_New_epTrans(n.tail);
                (*n2.tail).add_New_epTrans(n.tail);


                //debug
                // n.head->print();
                // n.tail->print();

                sta_nfa.push_back(n);

            }
            else if(nchar == '&')
            {
                // 取不出两个元素
                if(sta_nfa.size() < 2) throw Regular_Expression_Format_Exceprion();
                subNFA n2 = sta_nfa.back();
                sta_nfa.pop_back();
                subNFA n1 = sta_nfa.back();
                sta_nfa.pop_back();

                (*n1.tail).add_New_epTrans(n2.head);
                n1.tail = n2.tail;

                sta_nfa.push_back(n1);

            }
            else throw Regular_Expression_Format_Exceprion();
        }
        if(sta_nfa.size() != 1) throw Regular_Expression_Format_Exceprion();

        subnfa = sta_nfa[0];
    }

    catch(const Regular_Expression_Input_Exception& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }
    catch(const Regular_Expression_Format_Exceprion& e)
    {
        std::cerr << e.what() << "\n";
        exit(0);
    }
    catch(const std::exception& e)
    {
        std::cerr << e.what() << '\n';
        exit(0);
    }
    

}


// #ifndef __MAINPROCESS__
// int main()
// {
//     #ifdef __LOCAL
//     freopen("usrin.txt", "r", stdin);
//     freopen("usrout.txt", "w", stdout);
//     #endif

//     string str;
//     cin >> str;

//     NFA nfa(str);
//     nfa.print();
// }
// #endif


#endif

//DFA.cpp
#ifndef __DFA__
#define __DFA__

#include <bits/stdc++.h>
#include "NFA.cpp"
using namespace std;


struct Dfastate
{   
    // 获取对于字符c的闭包
    Dfastate move(char c , Nfastate* tail)
    {
        Dfastate ret;
        for(auto x:closure)
        {
            if(x->input == c) ret.closure.insert(x->chTrans);
        }
        ret.n_closure(tail);

        return ret;
    }

    void n_closure(Nfastate*);
    void upGrade_End_info(Nfastate*);
    bool isEnd;                 //判断是否是结束状态
    set<Nfastate*> closure;     //闭包
    void print();
    bool operator<(const Dfastate& b) const;
    bool operator>(const Dfastate& b) const;
    bool operator==(const Dfastate& b) const;
};

void Dfastate::print()
{
    cerr << "----------Dfastate-------------\n";
    for(auto x:closure) cerr << x << " "; cerr << "\n";
    cerr << (isEnd ? "YES" : "NO") << "\n";
    cerr << "-------------------------------\n\n";
}

bool Dfastate::operator<(const Dfastate& b)const
{   
    //debug
    // cerr << " < \n";
    return closure < b.closure;
}
bool Dfastate::operator>(const Dfastate& b)const
{
    //debug
    // cerr << " > \n";
    return closure > b.closure;
}
bool Dfastate::operator==(const Dfastate& b)const
{
    //debug
    // cerr << " == \n";
    return closure == b.closure;
}


void Dfastate::upGrade_End_info(Nfastate* tail)
{
    if(closure.find(tail) != closure.end()) isEnd = true;
    else isEnd = false;
}

void Dfastate::n_closure(Nfastate * tail)
{
    bool hasnewnode = 1;

    while(hasnewnode)
    {
        hasnewnode = 0;
        for(auto x:closure)
        {

            for(auto y:x->epTrans)
            {
                if(closure.find(y) == closure.end())
                {
                    hasnewnode = 1;
                    closure.insert(y);
                }
            }
            
        }
    }

    upGrade_End_info(tail);
}

struct subDFA
{
    bool isEnd;                 //是否是结束状态
    map<int , subDFA*> to;      //转移图
};

class DFA
{   
    public:
    DFA(string &expression)
    {
        // expression = str;
        for(auto x:expression)
        {
            int ntype = get_chartype(x);
            if(ntype == OPERANDS) charset.insert(x);
            if(ntype == ILLEGAL_CHAR) 
            {
                cerr << "ILLEGAL CHARACHTER\n";
                exit(0);
            }
        }
        Build_DFA(expression);

        //debug
        // for(auto x:charset) 
        // {
        //     cerr << x << " ";
        // }
    }
    void print();
    bool match(string&);
    private:
    void Build_DFA(string&);
    set<char> charset;          //用到的字符集
    subDFA* dfahead;               //DFA的初始状态
    // string expression;
};

bool DFA::match(string& matchstr)
{   
    subDFA* nowp = dfahead;

    if(nowp == NULL) return false;

    for(auto ch:matchstr)
    {   
        if(!nowp->to.count(ch)) return false;
        nowp = nowp->to[ch];
    }
    
    return nowp->isEnd;
}

void DFA::print()
{
    int nodecnt = 0;
    map<subDFA* , int> mp;

    queue<subDFA*> q;
    q.push(dfahead);
    mp[dfahead] = ++nodecnt;

    while(!q.empty())
    {
        subDFA * now = q.front();
        q.pop();

        for(auto next:now->to)
        {
            if(mp.count(next.second)) continue;
            else
            {
                mp[next.second] = ++nodecnt;
                q.push(next.second);
            }
        }
    }

    for(auto x:mp)
    {
        cout << "node[" << x.second <<"] : ";
        for(auto y:x.first->to)
        {
            cout << "(" << char(y.first) << " , " << mp[y.second] << ") ";
        }
        cout << "isEnd : " << ( (x.first->isEnd) ? "YES" : "NO" );
        cout << "\n";
    }
}

void DFA::Build_DFA(string & expression)
{
    NFA nfa(expression);
    Nfastate* head = nfa.gethead();
    Nfastate* tail = nfa.gettail();


    map<Dfastate , subDFA*> mp_Dfastate_PsubDFA;
    queue<Dfastate> q_Dfastate;

    // 获取初始状态的闭包
    Dfastate T;
    T.closure.insert(head);


    //debug
    // T.print();

    T.n_closure(tail);

    // T.print();

    //初始化map queue 以及转移的初始状态
    mp_Dfastate_PsubDFA[T] = new subDFA;
    mp_Dfastate_PsubDFA[T]->isEnd = T.isEnd;

    q_Dfastate.push(T);
    dfahead = mp_Dfastate_PsubDFA[T];

    while(!q_Dfastate.empty() )
    {
        T = q_Dfastate.front();
        q_Dfastate.pop();
        
        //debug
        // T.print();

        subDFA* now = mp_Dfastate_PsubDFA[T];

        for(auto ch:charset)
        {
            Dfastate next = T.move(ch , tail);

            //debug
            // cerr << "char = " << ch << "\n";
            // next.print();

            if(next.closure.empty()) continue;
            
            //状态没出现过就新建节点并加入队列
            if(!mp_Dfastate_PsubDFA.count(next))
            {
                mp_Dfastate_PsubDFA[next] = new subDFA;
                mp_Dfastate_PsubDFA[next]->isEnd = next.isEnd;
                q_Dfastate.push(next);
            }

            now->to[ch] = mp_Dfastate_PsubDFA[next];
        }
    }

    //debug
    // cerr << mp_Dfastate_PsubDFA.size() << "\n";
} 

const string Digit =  "(0|1|2|3|4|5|6|7|8|9)" ;
const string Pos_Digit = "(1|2|3|4|5|6|7|8|9)";

// #ifndef __MAINPROCESS__
// int main()
// {
//     #ifdef __LOCAL
//     freopen("usrin.txt", "r", stdin);
//     freopen("usrout.txt", "w", stdout);
//     #endif

//     string str;
    
//     string interger = "(0|" + Pos_Digit + Digit + "*)";
//     string decimal = "(0|" + Digit + "*" + Pos_Digit + ")";

//     string pat = "(" + interger + ")|(" + interger + "." + decimal + ")"; 
//     cout << pat  << "\n";

//     DFA dfa(pat);
//     dfa.print();

//     cin >> str;
//     cout << dfa.match(str);
// }
// #endif

#endif

//Main_Process.cpp
#ifndef __MAINPRECESS__
#define __MAINPROCESS__

#include<bits/stdc++.h>
#include "DFA.cpp"
using namespace std;

int main()
{

    #ifdef __LOCAL
    freopen("usrin.txt", "r", stdin);
    freopen("usrout.txt", "w", stdout);
    #endif

    string str;
    
    string interger = "(0|" + Pos_Digit + Digit + "*)";
    string decimal = "(0|" + Digit + "*" + Pos_Digit + ")";

    string pat = "(" + interger + ")|(" + interger + "." + decimal + ")"; 

    // pat = "(eedd";
    
    // pat 为正规式
    // str 为待检测的字符串
    pat = "(a|b)*acd*(c|d)";
    str = "aacdc";

    cout <<"Regular_Expression : " <<  pat  << "\n";
    // cin >> str;
    cout << "MatchString : " << str << "\n\n";

    // 构建DFA
    DFA dfa(pat);

    cout << "DFA GRAPH EXPRESSION:\n";

    // 输出DFA
    dfa.print();
    cout << "\n";

    //判断是否符合
    cout << (dfa.match(str) ? "Legal\n" : "Illegal\n");
}

#endif