...string_searchingstring_searchingstring_searching.cpp 1
#include <vector>
#include <string>
#include <algorithm>
#include <iostream>
/**
*
* @copyright Copyright Dr Russell John Childs, Phd, 2020-01-14
*
* Find occurrences of a substr S of length m, in a target string T of length n.
* Order-of-complexity: O(n+m) in time, O(1) in space.
*
* Operation:
* (1) Convert char to integer, I.
* (2) Use std::pair sliding window, w, with cumulative sums of I at positions
* p and p-m of target, where p is the range (substr length, target length).
* (3) Compare cumulative sum of I for substr against w.second - w.first
* (4) If (3) succeeds, perform O(m) char-by-char check that S=T[p-m]...T[m].
* (5) if (4) succeeds, append p-m to return vector of "found positions".
*
* Notes: Code is simpler than trie or KMP. Compiled under Visual Studio.
* The O(m) verification step (4) is only necessary because "+" commutes, which
* poses a problem for anagrams, a+b = b+a. There is a Group Theoretic way of
* creating an O(1)-updatable hash function that eliminates the need for (4).
* A char encoding, such as c -> (c-' ')*('~' - ' '), may differentiate between
* some same-length strings with different chars and product(c->prime) may
* differentiate between all same-length strings with differents chars, but not
* anagrams.
*
* @param substr : const std::string& - substring sought in target
* @param target : const std::string& - target string being searched
* @return std::vector<unsigned> - positions in target of matches, none -> empty
*
*/
std::vector<unsigned>
find(const std::string& substr, const std::string& target)
{
std::vector<unsigned> ret_val;
std::pair<unsigned long long, unsigned long long> window(0, 0);
unsigned signature = 0;
for (unsigned i = 0; (target.length() >= substr.length())
&& (i < target.length()); ++i)
{
//Get cumulative sum at pos i of target string being searched
window.second += target[i];
//Get cumulative sum for substring being sought
if (i < substr.length())
{
signature += substr[i];
}
if (i >= substr.length()-1)
{
//Check if cumul sum of substr == cumul sum of window
if ((window.second - window.first) == signature)
{
...string_searchingstring_searchingstring_searching.cpp 2
//Perform O(m) check of letter-by-letter match
bool match = true;
for (unsigned j = 0; j < substr.length(); ++j) match &=
(substr[j] == target[i - substr.length() + j + 1]);
//If matched, add index to vector of found positions
if (match) ret_val.push_back(i - substr.length() + 1);
}
//Get cumul sum at pos i - m of target string being searched
window.first += target[i - substr.length() + 1];
}
}
return ret_val;
}
int main(void)
{
//Bounds check on target length < substr length, expect empty list
std::string substr = "Amazon";
std::string target = "Amazo";
std::cout << substr << " found in " << target << " at ";
std::cout << std::endl;
//Check for substr at beginning and end, expect 0 and 19
target = "Amazon----Amazo----Amazon";
std::cout << substr << " found in " << target << " at ";
for (auto& elem : find(substr, target)) std::cout << elem << " ";
std::cout << std::endl;
//Check for substr in middle, expect 9
target = "Amazo----Amazon----mazon";
std::cout << substr << " found in " << target << " at ";
for (auto& elem : find(substr, target)) std::cout << elem << " ";
std::cout << std::endl;
//Check for repeated substr, expect 0,6,12,18,24,30
target = "AmazonAmazonAmazonAmazonAmazonAmazon";
std::cout << substr << " found in " << target << " at ";
for (auto& elem : find(substr, target)) std::cout << elem << " ";
std::cout << std::endl;
//Check for substr not matched, expect empty list
target = "Ama----z----on";
std::cout << substr << " found in " << target << " at ";
for (auto& elem : find(substr, target)) std::cout << elem << " ";
std::cout << std::endl;
}

String searching

  • 1.
    ...string_searchingstring_searchingstring_searching.cpp 1 #include <vector> #include<string> #include <algorithm> #include <iostream> /** * * @copyright Copyright Dr Russell John Childs, Phd, 2020-01-14 * * Find occurrences of a substr S of length m, in a target string T of length n. * Order-of-complexity: O(n+m) in time, O(1) in space. * * Operation: * (1) Convert char to integer, I. * (2) Use std::pair sliding window, w, with cumulative sums of I at positions * p and p-m of target, where p is the range (substr length, target length). * (3) Compare cumulative sum of I for substr against w.second - w.first * (4) If (3) succeeds, perform O(m) char-by-char check that S=T[p-m]...T[m]. * (5) if (4) succeeds, append p-m to return vector of "found positions". * * Notes: Code is simpler than trie or KMP. Compiled under Visual Studio. * The O(m) verification step (4) is only necessary because "+" commutes, which * poses a problem for anagrams, a+b = b+a. There is a Group Theoretic way of * creating an O(1)-updatable hash function that eliminates the need for (4). * A char encoding, such as c -> (c-' ')*('~' - ' '), may differentiate between * some same-length strings with different chars and product(c->prime) may * differentiate between all same-length strings with differents chars, but not * anagrams. * * @param substr : const std::string& - substring sought in target * @param target : const std::string& - target string being searched * @return std::vector<unsigned> - positions in target of matches, none -> empty * */ std::vector<unsigned> find(const std::string& substr, const std::string& target) { std::vector<unsigned> ret_val; std::pair<unsigned long long, unsigned long long> window(0, 0); unsigned signature = 0; for (unsigned i = 0; (target.length() >= substr.length()) && (i < target.length()); ++i) { //Get cumulative sum at pos i of target string being searched window.second += target[i]; //Get cumulative sum for substring being sought if (i < substr.length()) { signature += substr[i]; } if (i >= substr.length()-1) { //Check if cumul sum of substr == cumul sum of window if ((window.second - window.first) == signature) {
  • 2.
    ...string_searchingstring_searchingstring_searching.cpp 2 //Perform O(m)check of letter-by-letter match bool match = true; for (unsigned j = 0; j < substr.length(); ++j) match &= (substr[j] == target[i - substr.length() + j + 1]); //If matched, add index to vector of found positions if (match) ret_val.push_back(i - substr.length() + 1); } //Get cumul sum at pos i - m of target string being searched window.first += target[i - substr.length() + 1]; } } return ret_val; } int main(void) { //Bounds check on target length < substr length, expect empty list std::string substr = "Amazon"; std::string target = "Amazo"; std::cout << substr << " found in " << target << " at "; std::cout << std::endl; //Check for substr at beginning and end, expect 0 and 19 target = "Amazon----Amazo----Amazon"; std::cout << substr << " found in " << target << " at "; for (auto& elem : find(substr, target)) std::cout << elem << " "; std::cout << std::endl; //Check for substr in middle, expect 9 target = "Amazo----Amazon----mazon"; std::cout << substr << " found in " << target << " at "; for (auto& elem : find(substr, target)) std::cout << elem << " "; std::cout << std::endl; //Check for repeated substr, expect 0,6,12,18,24,30 target = "AmazonAmazonAmazonAmazonAmazonAmazon"; std::cout << substr << " found in " << target << " at "; for (auto& elem : find(substr, target)) std::cout << elem << " "; std::cout << std::endl; //Check for substr not matched, expect empty list target = "Ama----z----on"; std::cout << substr << " found in " << target << " at "; for (auto& elem : find(substr, target)) std::cout << elem << " "; std::cout << std::endl; }