diff options
Diffstat (limited to 'lib/enca/devel-docs/xml/internal.xml')
-rw-r--r-- | lib/enca/devel-docs/xml/internal.xml | 1328 |
1 files changed, 0 insertions, 1328 deletions
diff --git a/lib/enca/devel-docs/xml/internal.xml b/lib/enca/devel-docs/xml/internal.xml deleted file mode 100644 index 37fafc197a..0000000000 --- a/lib/enca/devel-docs/xml/internal.xml +++ /dev/null @@ -1,1328 +0,0 @@ -<?xml version="1.0"?> -<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd"> -<refentry id="libenca-Internal-Functions"> -<refmeta> -<refentrytitle role="top_of_page" id="libenca-Internal-Functions.top_of_page">internal</refentrytitle> -<manvolnum>3</manvolnum> -<refmiscinfo>LIBENCA Library</refmiscinfo> -</refmeta> - -<refnamediv> -<refname>internal</refname> -<refpurpose><para> -internal functions -</para></refpurpose> -</refnamediv> - -<refsynopsisdiv id="libenca-Internal-Functions.synopsis" role="synopsis"> -<title role="synopsis.title">Synopsis</title> - -<synopsis> - <link linkend="EncaAnalyserOptions">EncaAnalyserOptions</link>; - <link linkend="EncaAnalyserState">EncaAnalyserState</link>; - <link linkend="EncaCharsetInfo">EncaCharsetInfo</link>; - <link linkend="EncaLanguageInfo">EncaLanguageInfo</link>; - <link linkend="EncaLanguageHookData1CS">EncaLanguageHookData1CS</link>; - <link linkend="EncaLanguageHookDataEOL">EncaLanguageHookDataEOL</link>; -<link linkend="int">int</link> (<link linkend="EncaHookFunc">*EncaHookFunc</link>) (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="int">int</link> (<link linkend="EncaGuessFunc">*EncaGuessFunc</link>) (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); - <link linkend="EncaUTFCheckData">EncaUTFCheckData</link>; -#define <link linkend="ELEMENTS--CAPS">ELEMENTS</link> (array) -#define <link linkend="MAKE-HOOK-LINE--CAPS">MAKE_HOOK_LINE</link> (name) -#define <link linkend="EPSILON--CAPS">EPSILON</link> -#define <link linkend="FILL-NONLETTER--CAPS">FILL_NONLETTER</link> -#define <link linkend="LF--CAPS">LF</link> -#define <link linkend="CR--CAPS">CR</link> -<link 
linkend="void">void</link>* <link linkend="enca-malloc">enca_malloc</link> (<link linkend="size-t">size_t</link> size); -<link linkend="void">void</link>* <link linkend="enca-realloc">enca_realloc</link> (<link linkend="void">void</link> *ptr, - <link linkend="size-t">size_t</link> size); -#define <link linkend="enca-free">enca_free</link> (ptr) -#define <link linkend="NEW--CAPS">NEW</link> (type,n) -#define <link linkend="RENEW--CAPS">RENEW</link> (ptr,type,n) -<link linkend="char">char</link>* <link linkend="enca-strdup">enca_strdup</link> (const <link linkend="char">char</link> *s); -const <link linkend="char">char</link>* <link linkend="enca-strstr">enca_strstr</link> (const <link linkend="char">char</link> *haystack, - const <link linkend="char">char</link> *needle); -<link linkend="char">char</link>* <link linkend="enca-stpcpy">enca_stpcpy</link> (<link linkend="char">char</link> *dest, - const <link linkend="char">char</link> *src); -<link linkend="char">char</link>* <link linkend="enca-strconcat">enca_strconcat</link> (const <link linkend="char">char</link> *str, - ...); -<link linkend="char">char</link>* <link linkend="enca-strappend">enca_strappend</link> (<link linkend="char">char</link> *str, - ...); -#define <link linkend="enca-csname">enca_csname</link> (cs) -<link linkend="int">int</link> <link linkend="enca-name-to-charset">enca_name_to_charset</link> (const <link linkend="char">char</link> *csname); -<link linkend="EncaSurface">EncaSurface</link> <link linkend="enca-name-to-surface">enca_name_to_surface</link> (const <link linkend="char">char</link> *sname); -<link linkend="int">int</link> <link linkend="enca-language-init">enca_language_init</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - const <link linkend="char">char</link> *langname); -<link linkend="void">void</link> <link linkend="enca-language-destroy">enca_language_destroy</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link 
linkend="double">double</link>* <link linkend="enca-get-charset-similarity-matrix">enca_get_charset_similarity_matrix</link> (const <link linkend="EncaLanguageInfo">EncaLanguageInfo</link> *lang); -<link linkend="int">int</link> <link linkend="enca-charsets-subset-identical">enca_charsets_subset_identical</link> (<link linkend="int">int</link> charset1, - <link linkend="int">int</link> charset2, - const <link linkend="size-t">size_t</link> *counts); -<link linkend="size-t">size_t</link> <link linkend="enca-filter-boxdraw">enca_filter_boxdraw</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - unsigned <link linkend="char">char</link> fill_char); -<link linkend="int">int</link> <link linkend="enca-language-hook-ncs">enca_language_hook_ncs</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - <link linkend="size-t">size_t</link> ncs, - <link linkend="EncaLanguageHookData1CS">EncaLanguageHookData1CS</link> *hookdata); -<link linkend="int">int</link> <link linkend="enca-language-hook-eol">enca_language_hook_eol</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - <link linkend="size-t">size_t</link> ncs, - <link linkend="EncaLanguageHookDataEOL">EncaLanguageHookDataEOL</link> *hookdata); -<link linkend="void">void</link> <link linkend="enca-guess-init">enca_guess_init</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="void">void</link> <link linkend="enca-guess-destroy">enca_guess_destroy</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="EncaSurface">EncaSurface</link> <link linkend="enca-eol-surface">enca_eol_surface</link> (unsigned <link linkend="char">char</link> *buffer, - <link linkend="size-t">size_t</link> size, - const <link linkend="size-t">size_t</link> *counts); -<link linkend="void">void</link> <link linkend="enca-find-max-sec">enca_find_max_sec</link> (<link 
linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-BE--CAPS">ENCA_LANGUAGE_BE</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-BG--CAPS">ENCA_LANGUAGE_BG</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-CS--CAPS">ENCA_LANGUAGE_CS</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-ET--CAPS">ENCA_LANGUAGE_ET</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-HR--CAPS">ENCA_LANGUAGE_HR</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-HU--CAPS">ENCA_LANGUAGE_HU</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-LT--CAPS">ENCA_LANGUAGE_LT</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-LV--CAPS">ENCA_LANGUAGE_LV</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-PL--CAPS">ENCA_LANGUAGE_PL</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-RU--CAPS">ENCA_LANGUAGE_RU</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-SK--CAPS">ENCA_LANGUAGE_SK</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-SL--CAPS">ENCA_LANGUAGE_SL</link>; -extern const EncaLanguageInfo <link linkend="ENCA-LANGUAGE-UK--CAPS">ENCA_LANGUAGE_UK</link>; -<link linkend="void">void</link> <link linkend="enca-double-utf8-init">enca_double_utf8_init</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="void">void</link> <link linkend="enca-double-utf8-destroy">enca_double_utf8_destroy</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="void">void</link> <link linkend="enca-pair-init">enca_pair_init</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link linkend="void">void</link> <link linkend="enca-pair-destroy">enca_pair_destroy</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -<link 
linkend="int">int</link> <link linkend="enca-pair-analyse">enca_pair_analyse</link> (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser); -</synopsis> -</refsynopsisdiv> - - - - - - - - - -<refsect1 id="libenca-Internal-Functions.description" role="desc"> -<title role="desc.title">Description</title> -<para> -Do not use outside Enca library. -</para> -</refsect1> - -<refsect1 id="libenca-Internal-Functions.details" role="details"> -<title role="details.title">Details</title> -<refsect2 id="EncaAnalyserOptions" role="struct"> -<title>EncaAnalyserOptions</title> -<indexterm zone="EncaAnalyserOptions"><primary sortas="EncaAnalyserOptions">EncaAnalyserOptions</primary></indexterm><programlisting>typedef struct { - int const_buffer; - size_t min_chars; - double threshold; - int multibyte_enabled; - int interpreted_surfaces; - int ambiguous_mode; - int filtering; - int test_garbageness; - int termination_strictness; -} EncaAnalyserOptions; -</programlisting> -<para> -Analyser options, a part of analyser state.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term><link linkend="int">int</link> <structfield>const_buffer</structfield>;</term> -<listitem><simpara> Treat buffer as const? Otherwise its content can be, - and probably will be, modified. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>min_chars</structfield>;</term> -<listitem><simpara> Minimal number significant characters. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="double">double</link> <structfield>threshold</structfield>;</term> -<listitem><simpara> Minimal ratio between winner and the second. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>multibyte_enabled</structfield>;</term> -<listitem><simpara> Check for multibyte encodings? 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>interpreted_surfaces</structfield>;</term> -<listitem><simpara> Allow surfaces causing fundamental reinterpretation? -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>ambiguous_mode</structfield>;</term> -<listitem><simpara> Ambiguous mode? -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>filtering</structfield>;</term> -<listitem><simpara> Allow binary and box-drawing filters? -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>test_garbageness</structfield>;</term> -<listitem><simpara> Do test garbageness? -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>termination_strictness</structfield>;</term> -<listitem><simpara> Disallow broken multibyte sequences at buffer end? -</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaAnalyserState" role="struct"> -<title>EncaAnalyserState</title> -<indexterm zone="EncaAnalyserState"><primary sortas="EncaAnalyserState">EncaAnalyserState</primary></indexterm><programlisting>typedef struct { - /* Language data. */ - const EncaLanguageInfo *lang; - size_t ncharsets; - int *charsets; - /* Analyser state. */ - EncaErrno gerrno; - size_t size; - unsigned char *buffer; - EncaEncoding result; - size_t *counts; - size_t bin; - size_t up; - double *ratings; - size_t *order; - size_t size2; - unsigned char *buffer2; - /* Double-UTF-8 data. */ - EncaUTFCheckData *utfch; - int *utfbuf; - /* Pair frequency data */ - unsigned char *pair2bits; - size_t *bitcounts; - size_t *pairratings; - /* LCUC data XXX: unused (yet) */ - size_t *lcbits; - size_t *ucbits; - /* Options. 
*/ - EncaAnalyserOptions options; -} EncaAnalyserState; -</programlisting> -<para> -The internal analyser state. -</para> -<para> -Passed as an opaque object (`this') to analyser calls.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term>const <link linkend="EncaLanguageInfo">EncaLanguageInfo</link> *<structfield>lang</structfield>;</term> -<listitem><simpara> Language informations. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>ncharsets</structfield>;</term> -<listitem><simpara> Number of 8bit charsets in this language. - (Equal to <parameter>lang</parameter>->ncharsets.) -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> *<structfield>charsets</structfield>;</term> -<listitem><simpara> 8bit charset id's [<parameter>ncharsets</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaErrno">EncaErrno</link> <structfield>gerrno</structfield>;</term> -<listitem><simpara> Guessing gerrno. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>size</structfield>;</term> -<listitem><simpara> Size of buffer. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>unsigned <link linkend="char">char</link> *<structfield>buffer</structfield>;</term> -<listitem><simpara> Buffer whose encoding is to be detected [<parameter>size</parameter>]. - (Owned by outer world.) -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaEncoding">EncaEncoding</link> <structfield>result</structfield>;</term> -<listitem><simpara> Result returned to caller. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>counts</structfield>;</term> -<listitem><simpara> Character counts [0x100]. 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>bin</structfield>;</term> -<listitem><simpara> Number of `binary' characters. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>up</structfield>;</term> -<listitem><simpara> Number of 8bit characters. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="double">double</link> *<structfield>ratings</structfield>;</term> -<listitem><simpara> 8bit charset ratings [<parameter>ncharsets</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>order</structfield>;</term> -<listitem><simpara> Charset indices (not id's) sorted by ratings in descending order - [ncharsets]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>size2</structfield>;</term> -<listitem><simpara> Size of buffer2. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>unsigned <link linkend="char">char</link> *<structfield>buffer2</structfield>;</term> -<listitem><simpara> A temporary secondary buffer [<parameter>size2</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaUTFCheckData">EncaUTFCheckData</link> *<structfield>utfch</structfield>;</term> -<listitem><simpara> Double-UTF-8 test data [<parameter>ncharsets</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> *<structfield>utfbuf</structfield>;</term> -<listitem><simpara> Double-UTF-8 buffer for various UCS-2 character counting [0x10000]. - (Magic: see <link linkend="mark-scratch-buffer"><function>mark_scratch_buffer()</function></link> for description.) 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term>unsigned <link linkend="char">char</link> *<structfield>pair2bits</structfield>;</term> -<listitem><simpara> Character pair map to charsets [0x100000] (indexed - 0x100*first + second). Each bit corresponds to one charset, - when set, the pair is `good' for the given charset. The - type is char, so it breaks for <parameter>ncharsets</parameter> > 8, but it should - not be accessed from outer world, so it can be easily enlarged - to more bits. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>bitcounts</structfield>;</term> -<listitem><simpara> Counts for each possible bit combination in <parameter>pair2bits</parameter> - [0x1 << ncharsets]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>pairratings</structfield>;</term> -<listitem><simpara> Counts of `good' pairs per charset [<parameter>ncharsets</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>lcbits</structfield>;</term> -<listitem><simpara> If a character is lowercase in some charset, corresponding bit - is set [0x100]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> *<structfield>ucbits</structfield>;</term> -<listitem><simpara> If a character is uppercase in some charset, corresponding bit - is set [0x100]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaAnalyserOptions">EncaAnalyserOptions</link> <structfield>options</structfield>;</term> -<listitem><simpara> Analyser options. 
-</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaCharsetInfo" role="struct"> -<title>EncaCharsetInfo</title> -<indexterm zone="EncaCharsetInfo"><primary sortas="EncaCharsetInfo">EncaCharsetInfo</primary></indexterm><programlisting>typedef struct { - int enca; - int rfc1345; - int cstocs; - int iconv; - int mime; - const char *human; - unsigned int flags; - unsigned int nsurface; -} EncaCharsetInfo; -</programlisting> -<para> -General charset information. -</para> -<para> -All the <link linkend="int"><type>int</type></link> fields are indices in <link linkend="ALIAS-LIST--CAPS"><type>ALIAS_LIST</type></link>[].</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term><link linkend="int">int</link> <structfield>enca</structfield>;</term> -<listitem><simpara> Default, implicit name in enca. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>rfc1345</structfield>;</term> -<listitem><simpara> RFC1345 charset name. - (For charsets not in RFC1345, some canonical name is invented.) -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>cstocs</structfield>;</term> -<listitem><simpara> Cstocs charset name or -1. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>iconv</structfield>;</term> -<listitem><simpara> Iconv charset name or -1. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>mime</structfield>;</term> -<listitem><simpara> Preferred MIME charset name or -1. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>const <link linkend="char">char</link> *<structfield>human</structfield>;</term> -<listitem><simpara> Human comprehensible description. 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term>unsigned <link linkend="int">int</link> <structfield>flags</structfield>;</term> -<listitem><simpara> Charset properties (7bit, 8bit, multibyte, ...). -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>unsigned <link linkend="int">int</link> <structfield>nsurface</structfield>;</term> -<listitem><simpara> Natural surface (`implied' in recode). -</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaLanguageInfo" role="struct"> -<title>EncaLanguageInfo</title> -<indexterm zone="EncaLanguageInfo"><primary sortas="EncaLanguageInfo">EncaLanguageInfo</primary></indexterm><programlisting>typedef struct { - const char *name; - const char *humanname; - size_t ncharsets; - const char *const *csnames; - const unsigned short int *const *weights; - const unsigned short int *significant; - const unsigned char *const *letters; - const unsigned char **const *pairs; - long int weight_sum; - EncaHookFunc hook; - EncaHookFunc eolhook; - EncaHookFunc lcuchook; - EncaHookFunc ratinghook; -} EncaLanguageInfo; -</programlisting> -<para> -Language specific data.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term>const <link linkend="char">char</link> *<structfield>name</structfield>;</term> -<listitem><simpara> Language name, or more precisely, locale name. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>const <link linkend="char">char</link> *<structfield>humanname</structfield>;</term> -<listitem><simpara> Normal human-readable [English] language name. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>ncharsets</structfield>;</term> -<listitem><simpara> Number of charsets in this language. 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term>const <link linkend="char">char</link> *const  *<structfield>csnames</structfield>;</term> -<listitem><simpara> Charset names [<parameter>ncharsets</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term>long <link linkend="int">int</link> <structfield>weight_sum</structfield>;</term> -<listitem><simpara> Sum of all <parameter>weights</parameter> (is the same for all charsets). -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaHookFunc">EncaHookFunc</link> <structfield>hook</structfield>;</term> -<listitem><simpara> Hook function (deciding hard cases). -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaHookFunc">EncaHookFunc</link> <structfield>eolhook</structfield>;</term> -<listitem><simpara> EOL hook function (deciding ambiguous cases based on EOL type). -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaHookFunc">EncaHookFunc</link> <structfield>lcuchook</structfield>;</term> -<listitem><simpara> -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaHookFunc">EncaHookFunc</link> <structfield>ratinghook</structfield>;</term> -<listitem><simpara> Helper to calculate ratings for weightingless languages. 
-</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaLanguageHookData1CS" role="struct"> -<title>EncaLanguageHookData1CS</title> -<indexterm zone="EncaLanguageHookData1CS"><primary sortas="EncaLanguageHookData1CS">EncaLanguageHookData1CS</primary></indexterm><programlisting>typedef struct { - const char *name; - size_t size; - const unsigned char *list; - size_t cs; -} EncaLanguageHookData1CS; -</programlisting> -<para> -Container for data needed by <link linkend="enca-language-hook-ncs"><function>enca_language_hook_ncs()</function></link>.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term>const <link linkend="char">char</link> *<structfield>name</structfield>;</term> -<listitem><simpara> Charset name. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>size</structfield>;</term> -<listitem><simpara> Number of characters in <parameter>list</parameter>. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>cs</structfield>;</term> -<listitem><simpara> Charset number. This is an index in <parameter>analyser</parameter> arrays (like <parameter>charsets</parameter>), - NOT a charset id. 
-</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaLanguageHookDataEOL" role="struct"> -<title>EncaLanguageHookDataEOL</title> -<indexterm zone="EncaLanguageHookDataEOL"><primary sortas="EncaLanguageHookDataEOL">EncaLanguageHookDataEOL</primary></indexterm><programlisting>typedef struct { - const char *name; - EncaSurface eol; - size_t cs; -} EncaLanguageHookDataEOL; -</programlisting> -<para> -Cointainer for data needed by <link linkend="enca-language-hook-eol"><function>enca_language_hook_eol()</function></link>.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term>const <link linkend="char">char</link> *<structfield>name</structfield>;</term> -<listitem><simpara> Charset name. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="EncaSurface">EncaSurface</link> <structfield>eol</structfield>;</term> -<listitem><simpara> The corresponding <link linkend="EncaSurface"><type>EncaSurface</type></link> bit. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>cs</structfield>;</term> -<listitem><simpara> Charset number. This is an index in <parameter>analyser</parameter> arrays (like <parameter>charsets</parameter>), - NOT a charset id. -</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaHookFunc" role="function"> -<title>EncaHookFunc ()</title> -<indexterm zone="EncaHookFunc"><primary sortas="EncaHookFunc">EncaHookFunc</primary></indexterm><programlisting><link linkend="int">int</link> (*EncaHookFunc) (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Language hook function type. -</para> -<para> -Launches language specific hooks for a particular language.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser state whose charset ratings are to be modified. 
-</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero if charset ratings have been actually modified, zero -otherwise. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaGuessFunc" role="function"> -<title>EncaGuessFunc ()</title> -<indexterm zone="EncaGuessFunc"><primary sortas="EncaGuessFunc">EncaGuessFunc</primary></indexterm><programlisting><link linkend="int">int</link> (*EncaGuessFunc) (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Special (multibyte) encoding check function type.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser state whose buffer should be checked. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero if analyser->result has been set, zero otherwise. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="EncaUTFCheckData" role="struct"> -<title>EncaUTFCheckData</title> -<indexterm zone="EncaUTFCheckData"><primary sortas="EncaUTFCheckData">EncaUTFCheckData</primary></indexterm><programlisting>typedef struct { - double rating; - size_t size; - int result; - int *ucs2; - int *weights; -} EncaUTFCheckData; -</programlisting> -<para> -Data needed by double-UTF-8 check, per language charset.</para> -<para> -</para><variablelist role="struct"> -<varlistentry> -<term><link linkend="double">double</link> <structfield>rating</structfield>;</term> -<listitem><simpara> Total rating for this charset. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="size-t">size_t</link> <structfield>size</structfield>;</term> -<listitem><simpara> Number of UCS-2 characters. 
-</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> <structfield>result</structfield>;</term> -<listitem><simpara> Nonzero when the sample is probably Doubly-UTF-8 encoded from - this charset. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> *<structfield>ucs2</structfield>;</term> -<listitem><simpara> List of significant UCS-2 characters, in order [<parameter>size</parameter>]. -</simpara></listitem> -</varlistentry> -<varlistentry> -<term><link linkend="int">int</link> *<structfield>weights</structfield>;</term> -<listitem><simpara> Weights for double-UTF-8 check [<parameter>size</parameter>]. Positive means normal - UTF-8, negative doubly-encoded. -</simpara></listitem> -</varlistentry> -</variablelist></refsect2> -<refsect2 id="ELEMENTS--CAPS" role="macro"> -<title>ELEMENTS()</title> -<indexterm zone="ELEMENTS--CAPS"><primary sortas="ELEMENTS">ELEMENTS</primary></indexterm><programlisting>#define ELEMENTS(array) (sizeof(array)/sizeof((array)[0])) -</programlisting> -<para> -Compute the number of elements of a static array.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>array</parameter> :</term> -<listitem><simpara> An array whose size is to be computed. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> the number of elements. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="MAKE-HOOK-LINE--CAPS" role="macro"> -<title>MAKE_HOOK_LINE()</title> -<indexterm zone="MAKE-HOOK-LINE--CAPS"><primary sortas="MAKE_HOOK_LINE">MAKE_HOOK_LINE</primary></indexterm><programlisting>#define MAKE_HOOK_LINE(name)</programlisting> -<para> -Ugly code `beautifier' macro for language hooks.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>name</parameter> :</term> -<listitem><simpara> A charset name in C-style identifier suitable form. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="EPSILON--CAPS" role="macro"> -<title>EPSILON</title> -<indexterm zone="EPSILON--CAPS"><primary sortas="EPSILON">EPSILON</primary></indexterm><programlisting>#define EPSILON 0.000001 -</programlisting> -<para> -`Zero' for float comparison (and to prevent division by zero, etc.).</para> -<para> -</para></refsect2> -<refsect2 id="FILL-NONLETTER--CAPS" role="macro"> -<title>FILL_NONLETTER</title> -<indexterm zone="FILL-NONLETTER--CAPS"><primary sortas="FILL_NONLETTER">FILL_NONLETTER</primary></indexterm><programlisting>#define FILL_NONLETTER '.' 
-</programlisting> -<para> -Replacement character for non-letters in pair frequencies.</para> -<para> -</para></refsect2> -<refsect2 id="LF--CAPS" role="macro"> -<title>LF</title> -<indexterm zone="LF--CAPS"><primary sortas="LF">LF</primary></indexterm><programlisting>#define LF ((unsigned char)'\n') -</programlisting> -<para> -Line feed character (End-of-line on Unix).</para> -<para> -</para></refsect2> -<refsect2 id="CR--CAPS" role="macro"> -<title>CR</title> -<indexterm zone="CR--CAPS"><primary sortas="CR">CR</primary></indexterm><programlisting>#define CR ((unsigned char)'\r') -</programlisting> -<para> -Carriage return character (End-of-line on Macintosh).</para> -<para> -</para></refsect2> -<refsect2 id="enca-malloc" role="function"> -<title>enca_malloc ()</title> -<indexterm zone="enca-malloc"><primary sortas="enca_malloc">enca_malloc</primary></indexterm><programlisting><link linkend="void">void</link>* enca_malloc (<link linkend="size-t">size_t</link> size);</programlisting> -<para> -Allocates memory, always successfully (when fails, aborts program).</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>size</parameter> :</term> -<listitem><simpara> The number of bytes to allocate. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the newly allocated memory. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-realloc" role="function"> -<title>enca_realloc ()</title> -<indexterm zone="enca-realloc"><primary sortas="enca_realloc">enca_realloc</primary></indexterm><programlisting><link linkend="void">void</link>* enca_realloc (<link linkend="void">void</link> *ptr, - <link linkend="size-t">size_t</link> size);</programlisting> -<para> -Reallocates memory, always successfully (when fails, aborts program).</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>ptr</parameter> :</term> -<listitem><simpara> Pointer to block of previously allocated memory. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>size</parameter> :</term> -<listitem><simpara> The number of bytes to resize the block. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the newly allocated memory, <link linkend="NULL--CAPS"><type>NULL</type></link> when <parameter>size</parameter> is zero. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-free" role="macro"> -<title>enca_free()</title> -<indexterm zone="enca-free"><primary sortas="enca_free">enca_free</primary></indexterm><programlisting>#define enca_free(ptr)</programlisting> -<para> -Frees memory pointed by <parameter>ptr</parameter> with <link linkend="free"><function>free()</function></link> hack and assigns it a safe value, -thus may be called more than once. -</para> -<para> -<parameter>ptr</parameter> MUST be l-value.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>ptr</parameter> :</term> -<listitem><simpara> Pointer to memory to free. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="NEW--CAPS" role="macro"> -<title>NEW()</title> -<indexterm zone="NEW--CAPS"><primary sortas="NEW">NEW</primary></indexterm><programlisting>#define NEW(type,n) ((type*)enca_malloc((n)*sizeof(type))) -</programlisting> -<para> -An <link linkend="enca-malloc"><function>enca_malloc()</function></link> wrapper.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>type</parameter> :</term> -<listitem><simpara> Data type to allocate. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>n</parameter> :</term> -<listitem><simpara> Number of elements to allocate. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the newly allocated memory. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="RENEW--CAPS" role="macro"> -<title>RENEW()</title> -<indexterm zone="RENEW--CAPS"><primary sortas="RENEW">RENEW</primary></indexterm><programlisting>#define RENEW(ptr,type,n) ((type*)enca_realloc((ptr),(n)*sizeof(type))) -</programlisting> -<para> -An <link linkend="enca-realloc"><function>enca_realloc()</function></link> wrapper.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>ptr</parameter> :</term> -<listitem><simpara> Pointer to already allocate memory or <link linkend="NULL--CAPS"><type>NULL</type></link>. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>type</parameter> :</term> -<listitem><simpara> Data type to allocate. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>n</parameter> :</term> -<listitem><simpara> Number of elements to resize the memory to. 
-</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the reallocated memory (or pointer safe to call <link linkend="free"><function>free()</function></link> -on when <parameter>n</parameter> is zero). -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-strdup" role="function"> -<title>enca_strdup ()</title> -<indexterm zone="enca-strdup"><primary sortas="enca_strdup">enca_strdup</primary></indexterm><programlisting><link linkend="char">char</link>* enca_strdup (const <link linkend="char">char</link> *s);</programlisting> -<para> -Duplicates string. -</para> -<para> -Will be defined as <link linkend="strdup"><function>strdup()</function></link> when system provides it.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>s</parameter> :</term> -<listitem><simpara> A string. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> The newly allocated string copy. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-strstr" role="function"> -<title>enca_strstr ()</title> -<indexterm zone="enca-strstr"><primary sortas="enca_strstr">enca_strstr</primary></indexterm><programlisting>const <link linkend="char">char</link>* enca_strstr (const <link linkend="char">char</link> *haystack, - const <link linkend="char">char</link> *needle);</programlisting> -<para> -Finds occurrence of a substring in a string. -</para> -<para> -Will be defined as <link linkend="strstr"><function>strstr()</function></link> when system provides it.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>haystack</parameter> :</term> -<listitem><simpara> A string where to search. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>needle</parameter> :</term> -<listitem><simpara> A string to find. 
-</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the first occurrence of <parameter>needle</parameter> in <parameter>haystack</parameter>; <link linkend="NULL--CAPS"><type>NULL</type></link> if - not found. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-stpcpy" role="function"> -<title>enca_stpcpy ()</title> -<indexterm zone="enca-stpcpy"><primary sortas="enca_stpcpy">enca_stpcpy</primary></indexterm><programlisting><link linkend="char">char</link>* enca_stpcpy (<link linkend="char">char</link> *dest, - const <link linkend="char">char</link> *src);</programlisting> -<para> -Appends a string to the end of another string, returning pointer to -the terminating zero byte. -</para> -<para> -Will be defined as <link linkend="stpcpy"><function>stpcpy()</function></link> when system provides it. -</para> -<para> -Caller is responsible for providing <parameter>dest</parameter> long enough to hold the result.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>dest</parameter> :</term> -<listitem><simpara> A string. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>src</parameter> :</term> -<listitem><simpara> A string to append. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Pointer to the terminating zero byte of resulting string. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-strconcat" role="function"> -<title>enca_strconcat ()</title> -<indexterm zone="enca-strconcat"><primary sortas="enca_strconcat">enca_strconcat</primary></indexterm><programlisting><link linkend="char">char</link>* enca_strconcat (const <link linkend="char">char</link> *str, - ...);</programlisting> -<para> -Concatenates arbitrary (but at least one) number of strings.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>str</parameter> :</term> -<listitem><simpara> A string. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>...</parameter> :</term> -<listitem><simpara> A <link linkend="NULL-terminated"><type>NULL-terminated</type></link> list of string to append. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> All the strings concatenated together. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-strappend" role="function"> -<title>enca_strappend ()</title> -<indexterm zone="enca-strappend"><primary sortas="enca_strappend">enca_strappend</primary></indexterm><programlisting><link linkend="char">char</link>* enca_strappend (<link linkend="char">char</link> *str, - ...);</programlisting> -<para> -Appends arbitrary number of strings to a string. -</para> -<para> -The string <parameter>str</parameter> is destroyed (reallocated), the others are kept.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>str</parameter> :</term> -<listitem><simpara> A string. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>...</parameter> :</term> -<listitem><simpara> A <link linkend="NULL-terminated"><type>NULL-terminated</type></link> list of string to append. 
-</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> All the strings concatenated together. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-csname" role="macro"> -<title>enca_csname()</title> -<indexterm zone="enca-csname"><primary sortas="enca_csname">enca_csname</primary></indexterm><programlisting>#define enca_csname(cs) enca_charset_name((cs), ENCA_NAME_STYLE_ENCA) -</programlisting> -<para> -A shorthand for printing names with <link linkend="ENCA-NAME-STYLE-ENCA--CAPS"><type>ENCA_NAME_STYLE_ENCA</type></link>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>cs</parameter> :</term> -<listitem><simpara> A charset id. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-name-to-charset" role="function"> -<title>enca_name_to_charset ()</title> -<indexterm zone="enca-name-to-charset"><primary sortas="enca_name_to_charset">enca_name_to_charset</primary></indexterm><programlisting><link linkend="int">int</link> enca_name_to_charset (const <link linkend="char">char</link> *csname);</programlisting> -<para> -Transforms charset name to numeric charset id.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>csname</parameter> :</term> -<listitem><simpara> The charset name. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> The charset id; <link linkend="ENCA-CS-UNKNOWN--CAPS"><type>ENCA_CS_UNKNOWN</type></link> when the name is not recognized. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-name-to-surface" role="function"> -<title>enca_name_to_surface ()</title> -<indexterm zone="enca-name-to-surface"><primary sortas="enca_name_to_surface">enca_name_to_surface</primary></indexterm><programlisting><link linkend="EncaSurface">EncaSurface</link> enca_name_to_surface (const <link linkend="char">char</link> *sname);</programlisting> -<para> -Transforms surface name to numeric surface id.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>sname</parameter> :</term> -<listitem><simpara> The surface name. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> The surface id; <link linkend="ENCA-SURFACE-UNKNOWN--CAPS"><literal>ENCA_SURFACE_UNKNOWN</literal></link> when the name is not -recognized. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-language-init" role="function"> -<title>enca_language_init ()</title> -<indexterm zone="enca-language-init"><primary sortas="enca_language_init">enca_language_init</primary></indexterm><programlisting><link linkend="int">int</link> enca_language_init (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - const <link linkend="char">char</link> *langname);</programlisting> -<para> -Initializes analyser for language <parameter>langname</parameter>. -</para> -<para> -Assumes <parameter>analyser</parameter> is uninitialized, calling with an initialized <parameter>analyser</parameter> -leads to memory leak.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state to be initialized for this language. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>langname</parameter> :</term> -<listitem><simpara> Two-letter ISO-639 language code. 
-</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero on success, zero otherwise. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-language-destroy" role="function"> -<title>enca_language_destroy ()</title> -<indexterm zone="enca-language-destroy"><primary sortas="enca_language_destroy">enca_language_destroy</primary></indexterm><programlisting><link linkend="void">void</link> enca_language_destroy (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Destroys the language part of analyser state <parameter>analyser</parameter>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state whose language part should be destroyed. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-get-charset-similarity-matrix" role="function"> -<title>enca_get_charset_similarity_matrix ()</title> -<indexterm zone="enca-get-charset-similarity-matrix"><primary sortas="enca_get_charset_similarity_matrix">enca_get_charset_similarity_matrix</primary></indexterm><programlisting><link linkend="double">double</link>* enca_get_charset_similarity_matrix (const <link linkend="EncaLanguageInfo">EncaLanguageInfo</link> *lang);</programlisting> -<para> -Computes character weight similarity matrix for language <parameter>lang</parameter>. -</para> -<para> -sim[i,j] is normalized to sim[i,i] thus: -- a row i contains ,probabilities` different languages will look like the - i-th one -- a column i contains ,probabilities` the i-th language will look like - the other languages. -</para> -<para> -For all practical applications, the higher one of sim[i,j] and sim[j,i] -is important. 
-</para> -<para> -Note: this is not used anywhere, only by simtable.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>lang</parameter> :</term> -<listitem><simpara> A language. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> The matrix, its size is determined by <parameter>lang</parameter>->ncharsets; <link linkend="NULL--CAPS"><type>NULL</type></link> - for language with no charsets. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-charsets-subset-identical" role="function"> -<title>enca_charsets_subset_identical ()</title> -<indexterm zone="enca-charsets-subset-identical"><primary sortas="enca_charsets_subset_identical">enca_charsets_subset_identical</primary></indexterm><programlisting><link linkend="int">int</link> enca_charsets_subset_identical (<link linkend="int">int</link> charset1, - <link linkend="int">int</link> charset2, - const <link linkend="size-t">size_t</link> *counts);</programlisting> -<para> -Checks whether all characters with nonzero count have the same meaning -in both charsets. -</para> -<para> -In other words, it checks whether conversion of sample containing only -these characters from <parameter>charset1</parameter> to <parameter>charset2</parameter> would be identity.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>charset1</parameter> :</term> -<listitem><simpara> A charset. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>charset2</parameter> :</term> -<listitem><simpara> Another charset. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>counts</parameter> :</term> -<listitem><simpara> An array of size 0x100 containing character counts. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero if charsets are identical on the subset, zero otherwise. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-filter-boxdraw" role="function"> -<title>enca_filter_boxdraw ()</title> -<indexterm zone="enca-filter-boxdraw"><primary sortas="enca_filter_boxdraw">enca_filter_boxdraw</primary></indexterm><programlisting><link linkend="size-t">size_t</link> enca_filter_boxdraw (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - unsigned <link linkend="char">char</link> fill_char);</programlisting> -<para> -Runs boxdrawing characters filter on <parameter>buffer</parameter> for each charset in <parameter>language</parameter>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser whose charsets should be considered for filtration. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>fill_char</parameter> :</term> -<listitem><simpara> Replacement character for filtered bytes. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Number of characters filtered out. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-language-hook-ncs" role="function"> -<title>enca_language_hook_ncs ()</title> -<indexterm zone="enca-language-hook-ncs"><primary sortas="enca_language_hook_ncs">enca_language_hook_ncs</primary></indexterm><programlisting><link linkend="int">int</link> enca_language_hook_ncs (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - <link linkend="size-t">size_t</link> ncs, - <link linkend="EncaLanguageHookData1CS">EncaLanguageHookData1CS</link> *hookdata);</programlisting> -<para> -Decide between two charsets differing only in a few characters. 
-</para> -<para> -If the two most probable charsets correspond to <parameter>hookdata</parameter> charsets, -give the characters in which they differ half the weight of all other characters -together, thus allowing to decide between the two very similar charsets. -</para> -<para> -It also recomputes <parameter>order</parameter> when something changes.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser whose charset ratings are to be modified. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>ncs</parameter> :</term> -<listitem><simpara> The number of charsets. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>hookdata</parameter> :</term> -<listitem><simpara> What characters of which charsets should be given the extra - weight. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero when <parameter>ratings</parameter> were actually modified, zero otherwise. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-language-hook-eol" role="function"> -<title>enca_language_hook_eol ()</title> -<indexterm zone="enca-language-hook-eol"><primary sortas="enca_language_hook_eol">enca_language_hook_eol</primary></indexterm><programlisting><link linkend="int">int</link> enca_language_hook_eol (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser, - <link linkend="size-t">size_t</link> ncs, - <link linkend="EncaLanguageHookDataEOL">EncaLanguageHookDataEOL</link> *hookdata);</programlisting> -<para> -Decide between two charsets differing only in EOL type or other surface. -</para> -<para> -The (surface mask, charset) pairs are scanned in order. If a matching -surface is found, ratings of all other charsets in the list are zeroed. -So you can place a surface mask of all 1s at the end to match when nothing -else matches. 
-</para> -<para> -All the charsets have to have the same rating, or nothing happens. -</para> -<para> -It also recomputes <parameter>order</parameter> when something changes.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser whose charset ratings are to be modified. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>ncs</parameter> :</term> -<listitem><simpara> The number of charsets. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>hookdata</parameter> :</term> -<listitem><simpara> What characters of which charsets should be decided with based - on the EOL type. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero when <parameter>ratings</parameter> were actually modified, zero otherwise. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-guess-init" role="function"> -<title>enca_guess_init ()</title> -<indexterm zone="enca-guess-init"><primary sortas="enca_guess_init">enca_guess_init</primary></indexterm><programlisting><link linkend="void">void</link> enca_guess_init (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Allocates and initializes analyser state, sets options to defaults. -</para> -<para> -Assumes <parameter>analyser</parameter> is uninitialized, calling with an initialized <parameter>analyser</parameter> -leads to memory leak, but <parameter>analyser</parameter>->lang must be already initialized.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser to initialize. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-guess-destroy" role="function"> -<title>enca_guess_destroy ()</title> -<indexterm zone="enca-guess-destroy"><primary sortas="enca_guess_destroy">enca_guess_destroy</primary></indexterm><programlisting><link linkend="void">void</link> enca_guess_destroy (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Frees memory owned by analyser state.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser to destroy. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-eol-surface" role="function"> -<title>enca_eol_surface ()</title> -<indexterm zone="enca-eol-surface"><primary sortas="enca_eol_surface">enca_eol_surface</primary></indexterm><programlisting><link linkend="EncaSurface">EncaSurface</link> enca_eol_surface (unsigned <link linkend="char">char</link> *buffer, - <link linkend="size-t">size_t</link> size, - const <link linkend="size-t">size_t</link> *counts);</programlisting> -<para> -Find EOL type of sample in <parameter>buffer</parameter>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>buffer</parameter> :</term> -<listitem><simpara> A buffer whose EOL type is to be detected. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>size</parameter> :</term> -<listitem><simpara> Size of <parameter>buffer</parameter>. -</simpara></listitem></varlistentry> -<varlistentry><term><parameter>counts</parameter> :</term> -<listitem><simpara> Character counts. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> The EOL surface flags. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-find-max-sec" role="function"> -<title>enca_find_max_sec ()</title> -<indexterm zone="enca-find-max-sec"><primary sortas="enca_find_max_sec">enca_find_max_sec</primary></indexterm><programlisting><link linkend="void">void</link> enca_find_max_sec (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Updates <parameter>analyser</parameter>->order according to charset <parameter>ratings</parameter>. -</para> -<para> -XXX: This should be a stable sort. The ordering is defined by -data/<lang>/<lang>.h header file which is in turn defined by ordering in -the appropriate script (doit.sh). Silly. -</para> -<para> -Must not be called with <parameter>analyser</parameter> with no regular charsets.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> An analyser. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="ENCA-LANGUAGE-BE--CAPS" role="variable"> -<title>ENCA_LANGUAGE_BE</title> -<indexterm zone="ENCA-LANGUAGE-BE--CAPS"><primary sortas="ENCA_LANGUAGE_BE">ENCA_LANGUAGE_BE</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_BE; -</programlisting> -<para> -Belarusian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-BG--CAPS" role="variable"> -<title>ENCA_LANGUAGE_BG</title> -<indexterm zone="ENCA-LANGUAGE-BG--CAPS"><primary sortas="ENCA_LANGUAGE_BG">ENCA_LANGUAGE_BG</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_BG; -</programlisting> -<para> -Bulgarian language. 
-</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-CS--CAPS" role="variable"> -<title>ENCA_LANGUAGE_CS</title> -<indexterm zone="ENCA-LANGUAGE-CS--CAPS"><primary sortas="ENCA_LANGUAGE_CS">ENCA_LANGUAGE_CS</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_CS; -</programlisting> -<para> -Czech language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-ET--CAPS" role="variable"> -<title>ENCA_LANGUAGE_ET</title> -<indexterm zone="ENCA-LANGUAGE-ET--CAPS"><primary sortas="ENCA_LANGUAGE_ET">ENCA_LANGUAGE_ET</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_ET; -</programlisting> -<para> -Estonian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-HR--CAPS" role="variable"> -<title>ENCA_LANGUAGE_HR</title> -<indexterm zone="ENCA-LANGUAGE-HR--CAPS"><primary sortas="ENCA_LANGUAGE_HR">ENCA_LANGUAGE_HR</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_HR; -</programlisting> -<para> -Croatian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-HU--CAPS" role="variable"> -<title>ENCA_LANGUAGE_HU</title> -<indexterm zone="ENCA-LANGUAGE-HU--CAPS"><primary sortas="ENCA_LANGUAGE_HU">ENCA_LANGUAGE_HU</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_HU; -</programlisting> -<para> -Hungarian language. 
-</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-LT--CAPS" role="variable"> -<title>ENCA_LANGUAGE_LT</title> -<indexterm zone="ENCA-LANGUAGE-LT--CAPS"><primary sortas="ENCA_LANGUAGE_LT">ENCA_LANGUAGE_LT</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_LT; -</programlisting> -<para> -Lithuanian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-LV--CAPS" role="variable"> -<title>ENCA_LANGUAGE_LV</title> -<indexterm zone="ENCA-LANGUAGE-LV--CAPS"><primary sortas="ENCA_LANGUAGE_LV">ENCA_LANGUAGE_LV</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_LV; -</programlisting> -<para> -Latvian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-PL--CAPS" role="variable"> -<title>ENCA_LANGUAGE_PL</title> -<indexterm zone="ENCA-LANGUAGE-PL--CAPS"><primary sortas="ENCA_LANGUAGE_PL">ENCA_LANGUAGE_PL</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_PL; -</programlisting> -<para> -Polish language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-RU--CAPS" role="variable"> -<title>ENCA_LANGUAGE_RU</title> -<indexterm zone="ENCA-LANGUAGE-RU--CAPS"><primary sortas="ENCA_LANGUAGE_RU">ENCA_LANGUAGE_RU</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_RU; -</programlisting> -<para> -Russian language. 
-</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-SK--CAPS" role="variable"> -<title>ENCA_LANGUAGE_SK</title> -<indexterm zone="ENCA-LANGUAGE-SK--CAPS"><primary sortas="ENCA_LANGUAGE_SK">ENCA_LANGUAGE_SK</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_SK; -</programlisting> -<para> -Slovak language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-SL--CAPS" role="variable"> -<title>ENCA_LANGUAGE_SL</title> -<indexterm zone="ENCA-LANGUAGE-SL--CAPS"><primary sortas="ENCA_LANGUAGE_SL">ENCA_LANGUAGE_SL</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_SL; -</programlisting> -<para> -Slovene language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="ENCA-LANGUAGE-UK--CAPS" role="variable"> -<title>ENCA_LANGUAGE_UK</title> -<indexterm zone="ENCA-LANGUAGE-UK--CAPS"><primary sortas="ENCA_LANGUAGE_UK">ENCA_LANGUAGE_UK</primary></indexterm><programlisting>extern const EncaLanguageInfo ENCA_LANGUAGE_UK; -</programlisting> -<para> -Ukrainian language. -</para> -<para> -Everything the world out there needs to know about this language.</para> -<para> -</para></refsect2> -<refsect2 id="enca-double-utf8-init" role="function"> -<title>enca_double_utf8_init ()</title> -<indexterm zone="enca-double-utf8-init"><primary sortas="enca_double_utf8_init">enca_double_utf8_init</primary></indexterm><programlisting><link linkend="void">void</link> enca_double_utf8_init (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Initializes double-UTF-8 check. 
-</para> -<para> -In fact it initializes the fields to <link linkend="NULL--CAPS"><type>NULL</type></link>'s, they are actually initialized -only when needed.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state to be initialized. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-double-utf8-destroy" role="function"> -<title>enca_double_utf8_destroy ()</title> -<indexterm zone="enca-double-utf8-destroy"><primary sortas="enca_double_utf8_destroy">enca_double_utf8_destroy</primary></indexterm><programlisting><link linkend="void">void</link> enca_double_utf8_destroy (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Destroys the double-UTF-8 check part of analyser state <parameter>analyser</parameter>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state whose double-UTF-8 check part should be destroyed. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-pair-init" role="function"> -<title>enca_pair_init ()</title> -<indexterm zone="enca-pair-init"><primary sortas="enca_pair_init">enca_pair_init</primary></indexterm><programlisting><link linkend="void">void</link> enca_pair_init (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Initializes pair statistics data. -</para> -<para> -In fact it just sets everything to <link linkend="NULL--CAPS"><type>NULL</type></link>, to be initialized when needed.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state to be initialized. 
-</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-pair-destroy" role="function"> -<title>enca_pair_destroy ()</title> -<indexterm zone="enca-pair-destroy"><primary sortas="enca_pair_destroy">enca_pair_destroy</primary></indexterm><programlisting><link linkend="void">void</link> enca_pair_destroy (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Destroys the pair statistics part of analyser state <parameter>analyser</parameter>.</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyzer state whose pair statistics part should be destroyed. -</simpara></listitem></varlistentry> -</variablelist></refsect2> -<refsect2 id="enca-pair-analyse" role="function"> -<title>enca_pair_analyse ()</title> -<indexterm zone="enca-pair-analyse"><primary sortas="enca_pair_analyse">enca_pair_analyse</primary></indexterm><programlisting><link linkend="int">int</link> enca_pair_analyse (<link linkend="EncaAnalyserState">EncaAnalyserState</link> *analyser);</programlisting> -<para> -Performs pair-frequency based analysis, provided that the language supports -it (does nothing otherwise).</para> -<para> -</para><variablelist role="params"> -<varlistentry><term><parameter>analyser</parameter> :</term> -<listitem><simpara> Analyser containing the sample for pair frequency analysis. -</simpara></listitem></varlistentry> -<varlistentry><term><emphasis>Returns</emphasis> :</term><listitem><simpara> Nonzero when the character set was successfully determined, - <parameter>analyser</parameter>-><parameter>result</parameter>.<parameter>charset</parameter> is then directly modified. -</simpara></listitem></varlistentry> -</variablelist></refsect2> - -</refsect1> - - - - -</refentry> |