C++ Boost::Regex::ICU














































C++ Boost::Regex::ICU




Description : 

  Regular expressions are a form of pattern matching often used in text processing. Boost::Regex::ICU provides regex support for ICU strings, so the regex objects are aware of Unicode. The header lets you create regexes that treat Unicode strings as sequences of UTF code points and search Unicode strings transparently.

ICU - International Components for Unicode

 

All the available Unicode-aware functions are wrappers around the basic Boost.Regex functions. A detailed explanation of the basic functions can be found at the link below.


https://cppsecrets.com/users/14249710697121115114971069748525564103109971051084699111109/C00-BoostRegex.php


 

Requirement :

  The ICU library has to be installed on the system. It can be downloaded and installed from the link below.


http://site.icu-project.org/download/


Header : 

  <boost/regex/icu.hpp>

 

Available Functions :


  • make_u32regex
    • To create a Unicode aware regex object

boost::make_u32regex(expression);


  • u32regex_match
    • To test the Unicode regex with the string

boost::u32regex_match(sentence, regexU32)


  • u32regex_search
    • To search the test string with the Unicode aware regex for any match 

boost::u32regex_search(sentence, matchU32, regexU32)


  • u32regex_replace
    • To replace every part of the test string that matches the Unicode-aware regex with the given format string

boost::u32regex_replace(sentence, regexU32, format)


  • u32regex_iterator
    • Iterates over the matches found in the test string; dereferencing it gives a reference to the match_results of the current match

boost::u32regex_iterator<Iterator>iterator(boost::make_u32regex_iterator(sentence, regexU32));


  • u32regex_token_iterator
    • To iterate over the tokens generated in the match. It's an iterator adaptor

boost::u32regex_token_iterator<Iterator>iterator(boost::make_u32regex_token_iterator(sentence, regexU32, 0));

 

Compilation : 

  Boost.Regex is a separately compiled library component, so it has to be linked with the program when compiling. Since ICU is installed separately, the ICU libraries must also be linked with the program.


 

The linking can be done in either of the following ways (the library path may vary):

  • g++ test.cpp /usr/local/lib/libboost_regex.a $(icu-config --ldflags)
  • g++ test.cpp /usr/local/lib/libboost_regex.a -L/usr/local/lib -licui18n -licuuc -licudata 


 Code : 



#include<iostream>

#include<boost/regex.hpp>

#include<boost/regex/icu.hpp>


// Collection of small demos for the Unicode-aware (ICU-backed) Boost.Regex API.
//
// Boost.Regex ICU offers regex_match, regex_search and regex_replace under the
// names u32regex_match, u32regex_search and u32regex_replace respectively.
// Every method prints its results to std::cout and returns nothing.
class RegexICU
{
public:
    // Builds the same Unicode-aware regex in each of the ways make_u32regex
    // supports and reports whether each one matches `sentence`.
    void makeU32Regex(std::string sentence);

    // u32regex_match: compares a plain boost::regex match against a
    // Unicode-aware match of `expression` on `sentence`.
    void u32RegexMatch(std::string sentence, std::string expression);

    // u32regex_search: compares a plain boost::regex search against a
    // Unicode-aware search and prints the first captured sub-expression.
    void u32RegexSearch(std::string sentence, std::string expression);

    // u32regex_replace: prints `sentence` with the matches of `expression`
    // replaced by `format`.
    void u32RegexReplace(std::string sentence, std::string format, std::string expression);

    // u32regex_iterator: prints sub-expressions 1 and 2 of every match.
    void u32RegexIterator(std::string sentence, std::string expression);

    // u32regex_token_iterator: prints every whole match (sub-match 0).
    void u32RegexTokenIterator(std::string sentence, std::string expression);
};


void RegexICU::makeU32Regex(std::string sentence)

{

//Using a string

// Syntax option flags are optional

std::string expression = "(w+)sd{2}";

boost::u32regex regex = boost::make_u32regex(expression);


// Using iterators - creates a regex object from the iterator sequence.

// The character encoding of the sequence is determined based upon sizeof(*iterator)

std::string::iterator start,end;

start = expression.begin();

end = expression.end();

// The perl syntax option flag is mandatory while using iterators as per syntax

boost::u32regex regex2 = boost::make_u32regex(start, end, boost::regex_constants::perl);


// const char* - creates a regex object from the NULL terminated UTF-8 character sequence

// Syntax option flags are optional

const char* expressionCharPointer = "(w+)sd{2}";

boost::u32regex regex3 = boost::make_u32regex(expressionCharPointer);


//const wchar_t* (Wide char) - Creates a regex object from the Null-terminated character sequence

// The character encoding sequence is determined based upon sizeof(wchar_t)

// wchar_t is used to represent characters which require more memory to represent them than a regular char

const wchar_t* expressionWcharT = L"(w+)sd{2}";

boost::u32regex regex4 = boost::make_u32regex(expressionWcharT);


// Unicodestring - creates a regex object from the UTF-16 encoding string

// UnicodeString is a string class that stores Unicode characters directly

const UnicodeString expressionUcodeString = "(w+)sd{2}";

boost::u32regex regex5 = boost::make_u32regex(expressionUcodeString);


std::cout << " Using string regex" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regex) << std::endl;

std::cout << " Using iterator regex" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regex2) << std::endl;

std::cout << " Using char* regex" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regex3) << std::endl;

std::cout << " Using wchar_t* regex" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regex4) << std::endl;

std::cout << " Using Unicodestring regex" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regex5) << std::endl;

}


void RegexICU::u32RegexMatch(std::string sentence, std::string expression)

{

// Creating a regex object with unicode awareness

boost::u32regex regexU32 = boost::make_u32regex(expression);


//Creating a regex object without unicode awareness for comaparison

boost::regex regex(expression);


//Checks for a match with the regex and the unicode string and returns 0 or 1 based on result

// Boolalpha converts 0 or 1 to boolean values false or true respectively

std::cout << " regex_match" << std::endl;

std::cout << std::boolalpha << boost::regex_match(sentence, regex) << std::endl;

std::cout << " u32regex_match" << std::endl;

std::cout << std::boolalpha << boost::u32regex_match(sentence, regexU32) << std::endl;

}


void RegexICU::u32RegexSearch(std::string sentence, std::string expression)

{

// Creating a regex object with unicode awareness

boost::u32regex regexU32 = boost::make_u32regex(expression);


//Creating a regex object without unicode awareness for comaparison

boost::regex regex(expression);


// smatch to store the match if any

boost::smatch matchU32, match;


// Searches the sentence string for the regex match and returns 0 or 1 as per match

// Boolalpha converts 0 or 1 to boolean values false or true respectively

std::cout << " regex_search\n" << std::endl;

std::cout << std::boolalpha << boost::regex_search(sentence, match, regex) << std::endl;

std::cout << " u32regex_search\n" << std::endl;

std::cout << std::boolalpha << boost::u32regex_search(sentence, matchU32, regexU32) << std::endl;

std::cout << "\nu32regex_search subexpression capture : " << matchU32[1] << std::endl;

}


void RegexICU::u32RegexReplace(std::string sentence, std::string format, std::string expression)

{

// Creating a regex object with unicode awareness

boost::u32regex regexU32 = boost::make_u32regex(expression);


//Creating a regex object without unicode awareness for comaparison

boost::regex regex(expression);


std::cout << boost::u32regex_replace(sentence, regexU32, format) << std::endl;

}


void RegexICU::u32RegexIterator(std::string sentence, std::string expression)

{

// Creating a regex object with unicode awareness

boost::u32regex regexU32 = boost::make_u32regex(expression);


// Creating regex_iterator

boost::u32regex_iterator<std::string::const_iterator> iterator(boost::make_u32regex_iterator(sentence, regexU32));

boost::u32regex_iterator<std::string::const_iterator> checkEndIterator;


while (iterator != checkEndIterator)

{

// Printing the subexpression 1 match

std::cout << (*iterator)[1] << std::endl;

// Printing the subexpression 2 match

std::cout << (*iterator)[2] << std::endl;

iterator++;

}

}


void RegexICU::u32RegexTokenIterator(std::string sentence, std::string expression)

{

// Creating a regex object with unicode awareness

boost::u32regex regexU32 = boost::make_u32regex(expression);


// Creating a regex token iterator with submatch as 0 - returns all the matches

boost::u32regex_token_iterator<std::string::const_iterator> iterator(boost::make_u32regex_token_iterator(sentence, regexU32, 0));

boost::u32regex_token_iterator<std::string::const_iterator> checkEndIterator;


while (iterator != checkEndIterator)

{

// Printing the tokens

std::cout << *iterator << std::endl;

iterator++;

}


}


// Demo driver: runs every RegexICU example in turn.
//
// The Japanese test strings are written with universal character names so the
// source file stays pure ASCII; this assumes a UTF-8 execution character set
// (the GCC/Clang default), because the Unicode-aware functions interpret
// narrow strings as UTF-8. The regex escapes are written with doubled
// backslashes so that the regex engine receives the character classes for
// word, whitespace and digit characters rather than the letters w, s and d.
int main()
{
    RegexICU obj;

    std::cout << "Creating Unicode aware regex" << std::endl;
    obj.makeU32Regex("Integer 47");

    std::cout << "**********Regex functions**********" << std::endl;

    // Regex match with Unicode data: Kon'nichiwa ("hello" in Japanese).
    obj.u32RegexMatch("\u3053\u3093\u306B\u3061\u306F", "(\\w+)");

    // Regex search in Kon'nichiwa, genkidesuka ("hello, how are you" in
    // Japanese). The two phrases are separated by an ideographic comma.
    obj.u32RegexSearch("\u3053\u3093\u306B\u3061\u306F\u3001\u5143\u6C17\u3067\u3059\u304B", "(\\w+)");

    std::cout << " u32regex_replace\n" << std::endl;
    // Regex replace: the Japanese word and the whitespace after it are
    // replaced by its romanised form.
    obj.u32RegexReplace("\u3053\u3093\u306B\u3061\u306F 1233", "Kon'nichiwa ", "(\\w+)\\s");

    std::cout << " u32regex_iterator\n" << std::endl;
    // Regex iterator: captures the word and the number that follows it.
    obj.u32RegexIterator("\u3053\u3093\u306B\u3061\u306F 1233", "(\\w+)\\s(\\d+)");

    std::cout << " u32regex_token_iterator\n" << std::endl;
    // Regex token iterator: two "word number" pairs; the second word is
    // Seisuu ("integer" in Japanese).
    obj.u32RegexTokenIterator("\u3053\u3093\u306B\u3061\u306F 1233 \u6574\u6570 47", "(\\w+)\\s(\\d+)");
}


Note : The Unicode data used may not be rendered as-is; it might be shown as UTF code points instead. Use the online decoder at the link below to decode it.

 

https://www.online-toolz.com/tools/text-unicode-entities-convertor.php


 Output : 

Creating Unicode aware regex

     Using string regex

true

     Using iterator regex

true

     Using char* regex

true

     Using wchar_t* regex

true

     Using Unicodestring regex

true

**********Regex functions**********

     regex_match

false

     u32regex_match

true

     regex_search


false

     u32regex_search


true


u32regex_search subexpression capture : %u3053%u3093%u306B%u3061%u306F

     u32regex_replace


Kon'nichiwa 1233

     u32regex_iterator


%u3053%u3093%u306B%u3061%u306F

1233

     u32regex_token_iterator


%u3053%u3093%u306B%u3061%u306F 1233

%u6574%u6570 47


 


Comments

  • Abu
    21-Nov-2019 09:28:49 PM
    Good Article Bro, Keep it up