C++实现高效字符串查找算法

最近想到一个关于高效字符串查找算法的设想,然后果断实现之,算法基于哈希表,用于源字符串特别长的情况,查找的子字符串越长、越没规律,那么速度越快。可能已经有人做过,不过我撸代码前还没听说过类似算法,算是一种轮子吧。
基本实现的思路是:首先建立一个hash_map,然后将子字符串所有字符及位置录入字符串中,如下图所示:
20160401212342
对于需要查找的字符串(比如在很长的字符串文本中查找“abcdefga”这一串字符),构建如上所示哈希表,键值名为子字符串出现的字符,值为出现的位置。
构建好之后呢,就好玩了,我只说说正向查找原理,逆向查找类似。首先,来一个假设,我就假设源字符串为“abababcdefgaaaaa”这样吧,第一次,取源字符串中,(子字符串长度-1)这个位置的字符,值为d,然后取哈希表的值,为3,那么,将源字符串中7-3的位置开始,与子字符串相比较,比较结果较满意,第一次就查找成功,那么直接返回7-3=4。

实现代码如下,longstr.hpp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#pragma once
#ifndef _LONGSTR_HPP_
#define _LONGSTR_HPP_
 
#include <string>
#include <cstring>
#include <hash_map>
 
template<typename charT>
class longstr {
protected:
    //获取目标字符串哈希表
    static std::hash_map<charT, int> get_hm (std::basic_string<charT>& strfind) {
        int sflen = strfind.length ();
        std::hash_map<charT, int> hm;
        //hm.reserve (256);
        for (int i = 0; i < sflen; i++) hm.insert (pair<char, int> (strfind [i], i));
        return hm;
    }
 
    //正向查找算法实现
    static int _base_find (std::basic_string<charT>& src, std::basic_string<charT>& strfind, std::hash_map<charT, int>& hm, int pos) {
        int srclen = src.length (), sflen = strfind.length ();
        if (srclen < 1 || sflen < 1 || srclen < sflen) return -1;
 
        std::hash_map<charT, int>::iterator iter;
        for (int i = pos + sflen - 1; i < srclen; i += sflen) {
            iter = hm.lower_bound (src [i]);
            while (iter != hm.end ()) {
                if (!memcmp (&src [i - iter->second], &strfind [0], sflen)) return i - iter->second;
                iter++;
            }
        }
        return -1;
    }
 
    //反向查找算法实现
    static int _base_find_reverse (std::basic_string<charT>& src, std::basic_string<charT>& strfind, std::hash_map<charT, int>& hm, int pos) {
        int srclen = src.length (), sflen = strfind.length ();
        if (srclen < 1 || sflen < 1 || srclen < sflen) return -1;
 
        std::hash_map<charT, int>::iterator iter;
        for (int i = pos - sflen + 1; i >= 0; i -= sflen) {
            iter = hm.lower_bound (src [i]);
            while (iter != hm.end ()) {
                if (i < iter->second) break;
                if (!memcmp (&src [i - iter->second], &strfind [0], sflen)) return i - iter->second;
                iter++;
            }
        }
        return -1;
    }
 
public:
    //正向查找
    static int find (std::basic_string<charT>& src, std::basic_string<charT>& strfind, int pos = 0) {
        return longstr::_base_find (src, strfind, longstr::get_hm (strfind), pos);
    }
 
    //正向查找,回调返值
    static void find (std::basic_string<charT>& src, std::basic_string<charT>& strfind, bool (*callback)(int)) {
        std::hash_map<charT, int> hm = longstr::get_hm (strfind);
        int pos = longstr::_base_find (src, strfind, hm, 0);
        while (pos != -1 && callback (pos)) pos = longstr::_base_find (src, strfind, hm, pos + 1);
    }
 
    //反向查找
    static int find_reverse (std::basic_string<charT>& src, std::basic_string<charT>& strfind, int pos = 0) {
        if (!pos) pos = src.length () - 1;
        return longstr::_base_find_reverse (src, strfind, longstr::get_hm (strfind), pos);
    }
 
    //反向查找,回调返值
    static void find_reverse (std::basic_string<charT>& src, std::basic_string<charT>& strfind, bool (*callback)(int)) {
        std::hash_map<charT, int> hm = longstr::get_hm (strfind);
        int pos = longstr::_base_find_reverse (src, strfind, hm, 0);
        while (pos != -1 && callback (pos)) pos = longstr::_base_find_reverse (src, strfind, hm, pos - 1);
    }
 
    //获取找到的目标字符串的个数
    static int find_count (std::basic_string<charT>& src, std::basic_string<charT>& strfind) {
        std::hash_map<charT, int> hm = longstr::get_hm (strfind);
        int pos = longstr::_base_find (src, strfind, hm, 0), count = 0;
        while (pos != -1) {
            count++;
            pos = longstr::_base_find (src, strfind, hm, pos + 1);
        }
        return count;
    }
};
 
typedef longstr<char> longstr_a;
typedef longstr<wchar_t> longstr_w;
 
#endif //_LONGSTR_HPP_

示例调用代码:

1
2
3
4
5
6
7
8
9
10
11
#include <iostream>
#include <string>
using namespace std;
#include "longstr.hpp"
 
int main(int argc, char* argv[]) {
    string a = "fhaso4dfhui123456sdahfuiasd123456hfiusadifbsaifd", b = "123456";
    int i = longstr_a::find_reverse (a, b);
    cout << i << endl;
    return 0;
}

Published by

fawdlstty

又一只萌萌哒程序猿~~

发表评论

电子邮件地址不会被公开。 必填项已用*标注