decancer 3.2.4
A library that removes common unicode confusables/homoglyphs from strings.
 
No such query exists :(
Functions
Type definitions
Macros
Loading...
Searching...
No Matches
decancer

decancer npm crates.io npm downloads crates.io downloads codacy ko-fi

A library that removes common unicode confusables/homoglyphs from strings.

  • Its core is written in Rust and utilizes a form of Binary Search to ensure speed!
  • By default, it's capable of filtering 221,529 (19.88%) different unicode codepoints.
  • Unlike other packages, this package is unicode bidi-aware: it interprets right-to-left characters the same way an application would render them!
  • Its behavior is also highly customizable to your liking!

Installation

Download

Building from source

Building from source requires Rust v1.65 or later.

git clone https://github.com/null8626/decancer.git --depth 1
cd decancer/bindings/native
cargo build --release

And the binary files should be generated in the target/release directory.

Examples

For more information, please read the documentation.

UTF-8 example:

#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
/*
 * Evaluates `expr`; when it is false, prints "assertion failure at <notes>"
 * to stderr, sets the caller's `ret` variable to 1 and jumps to the caller's
 * `END` label.
 *
 * `notes` must be a string literal, because it is concatenated into the
 * format string at compile time. The enclosing function must provide both a
 * `ret` variable and an `END:` label.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and remains safe inside an unbraced if/else.
 */
#define decancer_assert(expr, notes)                       \
  do {                                                     \
    if (!(expr)) {                                         \
      fprintf(stderr, "assertion failure at " notes "\n"); \
      ret = 1;                                             \
      goto END;                                            \
    }                                                      \
  } while (0)
int main(void) {
int ret = 0;
// UTF-8 bytes for "vοΌ₯ⓑ𝔂 π”½π•ŒΕ‡β„•ο½™ ţ乇𝕏𝓣"
uint8_t input[] = {0x76, 0xef, 0xbc, 0xa5, 0xe2, 0x93, 0xa1, 0xf0, 0x9d, 0x94, 0x82, 0x20, 0xf0, 0x9d,
0x94, 0xbd, 0xf0, 0x9d, 0x95, 0x8c, 0xc5, 0x87, 0xe2, 0x84, 0x95, 0xef, 0xbd, 0x99,
0x20, 0xc5, 0xa3, 0xe4, 0xb9, 0x87, 0xf0, 0x9d, 0x95, 0x8f, 0xf0, 0x9d, 0x93, 0xa3};
decancer_cured_t cured = decancer_cure(input, sizeof(input), DECANCER_OPTION_DEFAULT, &error);
if (cured == NULL) {
fprintf(stderr, "curing error: %.*s\n", (int)error.message_length, error.message);
return 1;
}
decancer_assert(decancer_contains(cured, "funny", 5), "decancer_contains");
END:
return ret;
}
A library that removes common unicode confusables/homoglyphs from strings.
DECANCER_EXPORT void decancer_cured_free(decancer_cured_t cured)
Frees the cured string object created by decancer_cure and decancer_cure_utf16.
#define DECANCER_OPTION_DEFAULT
Uses decancer's default options – AKA to be AS AGGRESSIVE AS POSSIBLE.
Definition decancer.h:90
void * decancer_cured_t
Represents a cured string returned from decancer_cure and decancer_cure_utf16.
Definition decancer.h:498
DECANCER_EXPORT bool decancer_contains(decancer_cured_t cured, const uint8_t *other_str, const size_t other_size)
Checks if the cured string similarly contains the specified UTF-8 encoded string.
DECANCER_EXPORT decancer_cured_t decancer_cure(const uint8_t *input_str, const size_t input_size, const decancer_options_t options, decancer_error_t *error)
Cures a UTF-8 encoded string.
Represents an error caused by decancer not being able to cure a string.
Definition decancer.h:289
uint8_t message_length
The length of the error message.
Definition decancer.h:300
const char * message
Null-terminated ASCII encoded error message.
Definition decancer.h:294

UTF-16 example:

#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
/*
 * Evaluates `expr`; when it is false, prints "assertion failure at <notes>"
 * to stderr, sets the caller's `ret` variable to 1 and jumps to the caller's
 * `END` label.
 *
 * `notes` must be a string literal, because it is concatenated into the
 * format string at compile time. The enclosing function must provide both a
 * `ret` variable and an `END:` label.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and remains safe inside an unbraced if/else.
 */
#define decancer_assert(expr, notes)                       \
  do {                                                     \
    if (!(expr)) {                                         \
      fprintf(stderr, "assertion failure at " notes "\n"); \
      ret = 1;                                             \
      goto END;                                            \
    }                                                      \
  } while (0)
int main(void) {
int ret = 0;
// UTF-16 bytes for "vοΌ₯ⓑ𝔂 π”½π•ŒΕ‡β„•ο½™ ţ乇𝕏𝓣"
uint16_t input[] = {
0x0076, 0xff25, 0x24e1,
0xd835, 0xdd02, 0x0020,
0xd835, 0xdd3d, 0xd835,
0xdd4c, 0x0147, 0x2115,
0xff59, 0x0020, 0x0163,
0x4e47, 0xd835, 0xdd4f,
0xd835, 0xdce3
};
// UTF-16 bytes for "funny"
uint16_t funny[] = { 0x66, 0x75, 0x6e, 0x6e, 0x79 };
decancer_cured_t cured = decancer_cure_utf16(input, sizeof(input) / sizeof(uint16_t), DECANCER_OPTION_DEFAULT, &error);
if (cured == NULL) {
fprintf(stderr, "curing error: %.*s\n", (int)error.message_length, error.message);
return 1;
}
decancer_assert(decancer_contains_utf16(cured, funny, sizeof(funny) / sizeof(uint16_t)), "decancer_contains_utf16");
END:
return ret;
}
DECANCER_EXPORT bool decancer_contains_utf16(decancer_cured_t cured, const uint16_t *other_str, const size_t other_length)
Checks if the cured string similarly contains the specified UTF-16 encoded string.
DECANCER_EXPORT decancer_cured_t decancer_cure_utf16(const uint16_t *input_str, const size_t input_length, const decancer_options_t options, decancer_error_t *error)
Cures a UTF-16 encoded string.

Donations

If you want to support my eyes for manually looking at thousands of unicode characters, consider donating! ❀

ko-fi