Description:
Regular expressions are a form of pattern matching often used in text processing. C++ Boost provides regex support with many options and configurations to suit different needs. C++11 also supports regex, but Boost.Regex offers additional functionality, such as extra syntax options and their variants (for example, the grep variant of the POSIX basic syntax).
Available Functions :
Header File:
Compilation :
g++ fileName.cpp /usr/local/lib/libboost_regex.a
Boost.Regex is a separately compiled library component, so it must be linked with the program when building.
"/usr/local/lib/libboost_regex.a" - path to the libboost_regex static library (this may differ depending on your Boost installation).
Regex syntax available in Boost regex:
Available Functions :
Available variations in function call along with flags are explained in the code
Quick-link to references for syntax options and flags
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>

#include <boost/regex.hpp>
class BoostRegex
{
public:
//Boost Regex Match
void regexMatchBasic(std::string sentenceTrue, std::string sentenceFalse, std::string regex)
{
boost::regex expression(regex);
// regex_match returns 0 or 1 boolalpha is used to translate it to false and true respectively
//True
std::cout << " " << std::boolalpha << boost::regex_match(sentenceTrue, expression) << std::endl;
//False
std::cout << " " << std::boolalpha << boost::regex_match(sentenceFalse, expression) << std::endl;
}
void regexMatchWithSmatch(std::string sentence, std::string regex)
{
static boost::regex expression(regex);
boost::smatch match ; //Is used to capture the match results
std::cout << " " << std::boolalpha << boost::regex_match(sentence, match, expression) << std::endl;
std::cout << " " << "Captured subexpressions" << std::endl;
std::cout << " " << match[0] << " - Full match" << std::endl;
std::cout << " " << match[1] << " - The subexpression match 1" << std::endl;
std::cout << " " << match[2] << " - The subexpression match 2" << std::endl;
}
void regexMatchWithFlag(std::string sentence, std::string regex, boost::match_flag_type flag)
{
static boost::regex expression(regex);
//Other match_flag_types are explained at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/match_flag_type.html
//smatch to captured the results
boost::smatch match;
std::cout << "With match_nosubs flag - only the overall match will be captured" << std::endl;
std::cout << " " << std::boolalpha << boost::regex_match(sentence, match, expression, flag) << std::endl ;
std::cout << " " << match[0] << " - Full Match" << std::endl;
std::cout << " " << match[1] << " - No matched subexpression\n" << std::endl;
std::cout << "\nWithout match_nosubs flag - overall match along with the marked subexpression will be captured" << std::endl;
std::cout << " " << std::boolalpha << boost::regex_match(sentence, match, expression) << std::endl ;
std::cout << " " << match[0] << " - Full Match" << std::endl;
std::cout << " " << match[1] << " - The matched subexpression" << std::endl;
}
//Boost Regex Search
void regexSearchBasic(std::string sentence, std::string regexSub, std::string regexNoSub)
{
boost::regex expressionWithoutSub(regexNoSub);
boost::regex expressionWithSub(regexSub);
//To capture the results
boost::smatch match;
//String iterator for searching
std::string::const_iterator startIterator, endIterator;
startIterator = sentence.begin();
endIterator = sentence.end();
std::cout << " " << std::boolalpha << boost::regex_search(startIterator, endIterator, match, expressionWithoutSub) << std::endl;
std::cout << " " << match[0] << std::endl; // Index 0 Holds the full match
std::cout << " " << match[1] << " - No subexpression" << std::endl; // The subsequent indices hold the subsequent subexpression captures
//With subexpression
std::cout << " " << std::boolalpha << boost::regex_search(startIterator, endIterator, match, expressionWithSub) << std::endl;
std::cout << " " << match[0] << " - Full Match" << std::endl; // Index 0 Holds the full match
std::cout << " " << match[1] << " - Matched subexpression" << std::endl; // The subsequent indices hold the subsequent subexpression captures
}
void regexSearchWithoutSmatch(std::string sentence, std::string regex)
{
boost::regex expression(regex);
//Search and return 0 or 1 as per result
std::cout << " " << std::boolalpha << boost::regex_search(sentence, expression) << std::endl;
}
void regexSearchWithoutIterator(std::string sentence, std::string regex)
{
boost::regex expression(regex);
//Search and return 0 or 1 as per result
std::cout << " " << std::boolalpha << boost::regex_search(sentence, expression) << std::endl;
}
void regexSearchWithFlag(std::string sentence, std::string regex, boost::match_flag_type flag)
{
boost::regex expression(regex);
//Flag to accept partial matches
// boost::match_flag_type flag = boost::match_partial;
boost::smatch match;
//With partial match flag
std::cout << " " << std::boolalpha << boost::regex_search(sentence, match, expression, flag) << std::endl;
std::cout << " " << *match[0].first << std::endl;//Prints the start of the partial match
//Without Partial match Flag returns false
std::cout << " " << std::boolalpha << boost::regex_search(sentence, expression) << std::endl;
}
//Boost Regex Replace
void regexReplaceBasic(std::string sentence, std::string regex, std::string format)
{
boost::regex expression(regex);
std::cout << " " << boost::regex_replace(sentence, expression, format) << std::endl;
}
void regexReplaceWithReference(std::string sentence, std::string regex, std::string format)
{
boost::regex expression(regex);
std::cout << " " << boost::regex_replace(sentence, expression, format) << std::endl;
}
void regexReplaceWithFlag(std::string sentence, std::string regex, std::string format, boost::match_flag_type flag)
{
boost::regex expression(regex);
//boost::regex_constants::format_literal - Flag to supress the handling of special characters in format string, Replaces with the literal copy
std::cout << " " << boost::regex_replace(sentence, expression, format, flag) << std::endl;
}
void regexReplaceSedFormatString(std::string sentence, std::string regex, std::string sedFormat)
{
boost::regex expression(regex);
//Flag to set the SED format
boost::match_flag_type flag = boost::regex_constants::format_sed;
std::cout << " " << boost::regex_replace(sentence, expression, sedFormat, flag) << std::endl;
}
void regexReplacePerlFormatString(std::string sentence, std::string regex, std::string perlFormat)
{
boost::regex expression(regex);
boost::match_flag_type flag = boost::regex_constants::format_perl;
std::cout << " " << boost::regex_replace(sentence, expression, perlFormat, flag) << std::endl;
}
void regexReplaceBoostExtendedFormatString(std::string sentence, std::string regex, std::string boostExtendedFormat)
{
boost::regex expression(regex);
boost::match_flag_type flag = boost::regex_constants::format_all;
std::cout << " " << boost::regex_replace(sentence, expression, boostExtendedFormat, flag) << std::endl;
}
void regexIteratorWithoutFlag(std::string sentence, std::string regex)
{
boost::regex expression(regex);
std::string::const_iterator startIteraor, endIterator;
startIteraor = sentence.begin();
endIterator = sentence.end();
boost::sregex_iterator iterator(startIteraor, endIterator, expression);
std::cout << " " << (*iterator)[0] << std::endl;
std::cout << " " << (*iterator)[1] << std::endl;
std::cout << " " << (*iterator)[2] << std::endl;
std::cout << " " << *(*iterator)[1].first << std::endl;
std::cout << " " << *(*iterator)[2].first << std::endl;
}
void regexIteratorWithFlag(std::string sentence, std::string regex, boost::match_flag_type flag)
{
boost::regex expression(regex);
std::string::const_iterator startIteraor, endIterator;
startIteraor = sentence.begin();
endIterator = sentence.end();
boost::sregex_iterator iterator(startIteraor, endIterator, expression, flag);
std::cout << " " << (*iterator)[0] << std::endl;
std::cout << " " << (*iterator)[1] << std::endl;
}
void regexTokenIteratorBasicWithSubmatch(std::string sentence, std::string regex)
{
boost::regex expression(regex);
//Passsing 1 returns the first subgroup matches in the expression
boost::sregex_token_iterator store(sentence.begin(), sentence.end(), expression, 1);
boost::sregex_token_iterator checkEnd;
while(store != checkEnd)
{
std::cout << " " << *store++ << std::endl;
}
//passing -1 makes the expression act as delimiter, returns all the non matches
boost::sregex_token_iterator storeNoMatch(sentence.begin(), sentence.end(), expression, -1);
while(storeNoMatch != checkEnd)
{
std::string item = *storeNoMatch++ ;
if(item != " ")
std::cout << " " << item << std::endl;
}
//passing 0 returns all the matches
boost::sregex_token_iterator storeMatch(sentence.begin(), sentence.end(), expression, 0);
while(storeMatch != checkEnd)
{
std::cout << " " << *storeMatch++ << std::endl ;
}
}
void regexTokenIteratorWithoutSubmatch(std::string sentence, std::string regex)
{
boost::regex expression(regex);
boost::sregex_token_iterator checkEnd;
//The default submatch argument is 0 so, returns all the matches
boost::sregex_token_iterator storeMatch(sentence.begin(), sentence.end(), expression);
while(storeMatch != checkEnd)
{
std::cout << " " << *storeMatch++ << std::endl ;
}
}
//Locale using imbue
void linkingLocale(std::string sentence, std::string regex, std::string locale)
{
boost::regex expression;
expression.imbue(std::locale{locale});
expression = regex;
std::cout << " " << std::boolalpha << boost::regex_match(sentence, expression) << std::endl;
}
//Perl Regular Expression
void perlRegularExpression(std::string sentence, std::string regex)
{
//Specifies that the regex engine recognizes string using its normal semantics
//icase - specifies that the matching is done without regarding the case
boost::regex expression(regex, boost::regex_constants::ECMAScript|boost::regex_constants::icase);
//The available options for perl regex can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html
std::cout << " " << std::boolalpha << boost::regex_match(sentence, expression) << std::endl;
}
//POSIX Basic regex
void posixBasicRegex(std::string sentence, std::string regex)
{
//Specifies that the regex will be treated with the GREP variant of POSIX basic
//Grep treats \n newline character as an alternative character. An alteranative character is like OR operator
boost::regex expression(regex, boost::regex_constants::grep);
// The available options for POSIX basic Regex can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html
std::cout << " " << std::boolalpha << boost::regex_match(sentence, expression) << std::endl;
}
//POSIX Extended regex
void posixExtendedRegex(std::string sentenece, std::string regex)
{
//Specifies that the regex will be treated with the regex engine of POSIX extended Regular Expression
//nosubs - Specifies that when a regex is matched against a character container sequence(Subexpression), then no subexpression matches are to be stored
boost::regex expresssion(regex, boost::regex_constants::extended|boost::regex_constants::nosubs);
// The available options for POSIX Extended Regex can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html
boost::smatch match;
std::cout << " " << std::boolalpha << boost::regex_match(sentenece, match, expresssion) << std::endl;
std::cout << " " << match[0] << "- Full Match" << std::endl;
std::cout << " " << match[1] << "- No Subexpression match is captured" << std::endl;
}
//Literal String
void literalStrings(std::string sentence, std::string regex)
{
//Specifies that the regex has to be treated as a literal string while matching
//optimize - specifies the regex engine to pay more attention to speed, has no visible effect in the output
// This currently has no effect for Boost.Regex
boost::regex expression(regex, boost::regex_constants::literal|boost::regex_constants::optimize);
//The availble options for literal strings can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html
std::cout << " " << std::boolalpha << boost::regex_match(sentence, expression) << std::endl;
}
//Error Handling
void regexError(std::string regex)
{
try
{
boost::regex expression(regex);
}
catch(const boost::regex_error e)
{
std::cerr << " " << e.what() << "\n Error code : " << e.code() << "\n Error position : " << e.position() << std::endl;
}
}
};
// Runs every BoostRegex demonstration in turn. Each section prints a banner
// and then calls one demo method, which prints its own results.
// All regex metacharacter escapes are written with a doubled backslash
// ("\\d", "\\w", "\\s") so that the regex engine receives a single backslash.
int main()
{
    BoostRegex obj;

    /* ------------------------------ regex_match ------------------------------ */
    std::cout << "\n **********Boost Regex Match**********" << std::endl;
    //The basic form takes the test strings and the regular expression
    obj.regexMatchBasic("Word", "047", "\\d+");

    std::cout << "\n **********Boost Regex Match with Flag**********" << std::endl;
    // Along with the test string and the regex, a flag can be passed to alter the way of matching.
    // The match_flag_type values available can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/match_flag_type.html
    obj.regexMatchWithFlag("Regex 9w", "(\\w+)\\s\\d+\\w+", boost::match_nosubs);

    std::cout << "\n **********Boost Regex Match with smatch**********" << std::endl;
    //boost::smatch stores the regex match and the subexpression matches. A subexpression is enclosed in parentheses ()
    obj.regexMatchWithSmatch("subexpression 7047", "(\\w+)\\s(\\d+)");

    /* ------------------------------ regex_search ----------------------------- */
    std::cout << "\n **********Boost Regex Search********** " << std::endl;
    // The test string is searched with the regex. The call below shows the difference in
    // captures between a pattern with a subexpression and one without
    obj.regexSearchBasic("Regex 047 subexpression test", "(\\d{3})", "\\d{3}");

    std::cout << "\n **********Boost Regex Search without smatch********** " << std::endl;
    // Regex search can be used without an smatch to store the results
    obj.regexSearchWithoutSmatch("demo Dart89", "(\\w+\\d{2})");

    std::cout << "\n **********Boost Regex Search without Iterator********** " << std::endl;
    // Similarly, the search can be done by passing the string instead of start and end iterators
    obj.regexSearchWithoutIterator("989505", "[a-z]+");

    std::cout << "\n **********Boost Regex Search with Flag********** " << std::endl;
    //Just like regex match, flags can be set to alter the way of searching
    // match_partial accepts partial matches at the end of the test string
    obj.regexSearchWithFlag("Dart90", "\\d{3}", boost::match_partial);

    /* ------------------------------ regex_replace ---------------------------- */
    std::cout << "\n **********Boost Regex Replace********** " << std::endl;
    // The regex matches in the test string are replaced with the given format string
    obj.regexReplaceBasic(" replaced1 replaced2 95822", "([a-z]+)", "demo");

    std::cout << "\n **********Boost Regex Replace with reference********** " << std::endl;
    // Subexpressions can be used as references for reordering the test string. A subexpression
    // is accessed in the format string as a backslash followed by the subexpression number
    obj.regexReplaceWithReference("is this test working", "(\\w+)\\s(\\w+)\\s(\\w+)", "\\2 \\3 \\1");

    std::cout << "\n **********Boost Regex Replace with Flag********** " << std::endl;
    // Like regex search and match, replace also supports flags. format_literal suppresses the
    // escape sequences in the format string, so the format text is copied verbatim
    // (the back-references below are not expanded)
    obj.regexReplaceWithFlag("is this test working", "(\\w+)\\s(\\w+)\\s(\\w+)", "\\2 \\3 \\1", boost::regex_constants::format_literal);

    std::cout << "\n **********Boost Regex Replace using SED format string********** " << std::endl;
    //The format string in replace can be provided in 3 syntax types, namely sed, perl and boost extended
    //The call below passes its format string as a sed format string. More info about sed format can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/format/sed_format.html
    //"8 9" contains no sed placeholders, so the matched part is replaced with the text "8 9"
    obj.regexReplaceSedFormatString("replaced demo", "(\\w+)\\s", "8 9");

    std::cout << "\n **********Boost Regex Replace using PERL format string********** " << std::endl;
    //The call below uses perl syntax for the format string. More information can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/format/perl_format.html
    // $n refers to the n-th matched subexpression, so $2 replaces the whole match
    // with the second subexpression
    obj.regexReplacePerlFormatString("This subexpression2 test perl", "(\\w+)\\s(\\w+)\\s(\\w+)", "$2");

    std::cout << "\n **********Boost Regex Replace using boost extended format string********** " << std::endl;
    // The call below uses Boost-extended syntax for the format string. More information can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/format/boost_format_syntax.html
    // $' - all the text following the end of the current match.
    // The current match below is 047, so it is replaced with all the text after the match
    obj.regexReplaceBoostExtendedFormatString("Ajay 047 partAfterMatch", "(\\d{3})", "$'");

    /* ------------------------------ regex_iterator --------------------------- */
    std::cout << "\n **********Boost Regex Iterator without Flag********** " << std::endl;
    //A regex iterator enumerates all the regex matches
    //Dereferencing the iterator yields a reference to an smatch object
    obj.regexIteratorWithoutFlag("999 555 45", "(\\d+)\\s(\\d+)");

    std::cout << "\n **********Boost Regex Iterator with Flag********** " << std::endl;
    //As with search, replace and match, a flag can be passed to the iterator to alter its behaviour
    obj.regexIteratorWithFlag("999 555 45", "(\\d+)\\s(\\d+)", boost::regex_constants::match_nosubs);

    /* --------------------------- regex_token_iterator ------------------------ */
    std::cout << "\n **********Boost Regex Token Iterator with submatch********** " << std::endl;
    // A regex token iterator is an iterator adaptor over the match results
    // It takes an integer n selecting which submatch is returned for every match:
    // n > 0 selects the n-th subexpression
    // n = 0 (the default) returns the whole match
    // n = -1 makes the expression act as a delimiter and returns the text between matches
    obj.regexTokenIteratorBasicWithSubmatch("test Demo Words98 *$", "(\\w)\\w+");

    std::cout << "\n **********Boost Regex Token Iterator without submatch********** " << std::endl;
    //Omitting the submatch argument gives the same results as submatch = 0, the default value
    obj.regexTokenIteratorWithoutSubmatch("test Demo Words98 *$", "(\\w)\\w+");

    /* ---------------------------- Locale using imbue ------------------------- */
    std::cout << "\n **********Boost Linking Locale********** " << std::endl;
    // The regular expression can be linked to a locale through imbue
    // The system has to provide the Japanese locale (e.g. "sudo dpkg-reconfigure locales")
    // NOTE(review): the first literal was found mis-encoded as "%u016B" and is restored here as
    // U+016B; the second literal looks like the remains of a hex-escaped spelling of the same
    // word and is kept unchanged. Confirm both against the original article.
    obj.linkingLocale("Haiky\u016B", "(\\w+)", "ja_JP.UTF-8");
    obj.linkingLocale("Haiky6B", "(\\w+)", "ja_JP.UTF-8");

    /* -------------------------------- Regex Perl ----------------------------- */
    // Boost regex provides four syntaxes for the regular expression,
    // namely Perl, POSIX basic, POSIX extended and literal strings
    std::cout << "\n **********Boost Regex Perl********** " << std::endl;
    // More information about perl syntax options can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html
    // Perl syntax https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/syntax/perl_syntax.html
    obj.perlRegularExpression("IGNORE CASE", "[a-z]+\\s[a-z]+");

    /* ------------------------------ Literal Strings -------------------------- */
    std::cout << "\n **********Boost Regex Literal String********** " << std::endl;
    //More information about literal string options can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html
    //The regex is taken as a literal string for matching
    obj.literalStrings("Literal String", "Literal String");

    /* -------------------------------- POSIX Basic ---------------------------- */
    std::cout << "\n **********Boost Regex POSIX Basic********** " << std::endl;
    //More information about POSIX Basic syntax options can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html
    //POSIX Basic syntax https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/syntax/basic_syntax.html
    //The "\n" below is a real newline character inside the pattern, not a regex escape
    obj.posixBasicRegex("047", "[a-z]*\n[0-9]*");

    /* ------------------------------- POSIX Extended -------------------------- */
    std::cout << "\n **********Boost Regex POSIX Extended**********" << std::endl;
    //More information about POSIX Extended syntax options can be found at https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html
    //POSIX Extended syntax https://www.boost.org/doc/libs/1_69_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html
    obj.posixExtendedRegex("Extended Posix", "(\\w+)\\s(\\w+)");

    /* ----------------------------- Exception Handling ------------------------ */
    std::cout << "\n **********Regex Exception Handling**********" << std::endl;
    // An incomplete subexpression (unclosed parenthesis) is passed, which results in an exception
    obj.regexError("(\\w");

    return 0;
}
Output :
**********Boost Regex Match**********
false
true
**********Boost Regex Match with Flag**********
With match_nosubs flag - only the overall match will be captured
true
Regex 9w - Full Match
- No matched subexpression
Without match_nosubs flag - overall match along with the marked subexpression will be captured
true
Regex 9w - Full Match
Regex - The matched subexpression
**********Boost Regex Match with smatch**********
true
Captured subexpressions
subexpression 7047 - Full match
subexpression - The subexpression match 1
7047 - The subexpression match 2
**********Boost Regex Search**********
true
047
- No subexpression
true
047 - Full Match
047 - Matched subexpression
**********Boost Regex Search without smatch**********
true
**********Boost Regex Search without Iterator**********
false
**********Boost Regex Search with Flag**********
true
9
false
**********Boost Regex Replace**********
demo1 demo2 95822
**********Boost Regex Replace with reference**********
this test is working
**********Boost Regex Replace with Flag**********
2 3 1 working
**********Boost Regex Replace using SED format string**********
8 9demo
**********Boost Regex Replace using PERL format string**********
subexpression2 perl
**********Boost Regex Replace using boost extended format string**********
Ajay partAfterMatch partAfterMatch
**********Boost Regex Iterator without Flag**********
999 555
999
555
9
5
**********Boost Regex Iterator with Flag**********
999 555
**********Boost Regex Token Iterator with submatch**********
t
D
W
*$
test
Demo
Words98
**********Boost Regex Token Iterator without submatch**********
test
Demo
Words98
**********Boost Linking Locale**********
false
true
**********Boost Regex Perl**********
true
**********Boost Regex Literal String**********
true
**********Boost Regex POSIX Basic**********
true
**********Boost Regex POSIX Extended**********
true
Extended Posix- Full Match
- No Subexpression match is captured
**********Regex Exception Handling**********
Unmatched marking parenthesis ( or (. The error occurred while parsing the regular expression: '(w>>>HERE>>>'.
Error code : 8
Error position : 3
Name | Views | Likes |
---|---|---|
C++ Boost::Regex::ICU | 772 | 1 |
C++Boost::Regex | 2355 | 9 |
C++ Boost::Test | 1863 | 0 |
C++ program to print the longest leaf to leaf path in a binary tree | 1027 | 0 |
C++ Boost::Test::BOOST_TEST_CASE | 386 | 0 |
C++ Boost::Test::BOOST_level_EQUAL | 412 | 1 |
C++ Boost::Test::BOOST_TEST_SUITE | 496 | 0 |
C++ Boost::Regex | 1093 | 0 |
C++ Boost::Test::BOOST_AUTO_TEST_CASE | 2070 | 0 |
C++ Boost::Test::BOOST_AUTO_TEST_SUITE | 1310 | 0 |
Comments