libs/unicode/sortkey.c - wine - Git at Google

 /*
  * Unicode sort key generation
  *
  * Copyright 2003 Dmitry Timoshkov
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "wine/unicode.h"

 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
 extern const unsigned int collation_table[];

 /*
  * wine_get_sortkey
  *
  * Build a byte-string sort key for a string from collation_table.
  *
  * flags  - normalization NORM_* flags; only NORM_IGNORESYMBOLS and
  *          NORM_IGNORECASE are looked at here
  * src    - source characters
  * srclen - number of characters in src
  * dst    - output buffer, not written when dstlen is 0 or too small
  * dstlen - size of dst in bytes, or 0 to only compute the needed size
  *
  * The key consists of four sections (unicode weights, diacritic weights,
  * case weights, character codes), each one closed by a '\1' byte, and a
  * final 0 byte.
  *
  * Returns the needed buffer size (terminating 0 included) when dstlen is 0,
  * 0 when dst is too small, otherwise the number of bytes stored in dst
  * not counting the terminating 0.
  *
  * FIXME: 'variable' flag not handled
  */
 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
 {
     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
     int key_len[4];
     char *key_ptr[4];
     const WCHAR *src_save = src;
     int srclen_save = srclen;
     int total_len;

     /* pass 1: count the bytes needed by each of the four key sections */
     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
     for (; srclen; srclen--, src++)
     {
         /* the get_decomposition() call is disabled, so every source
          * character is processed as a single undecomposed character */
         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
         dummy[0] = *src;
         if (decomposed_len)
         {
             int i;
             for (i = 0; i < decomposed_len; i++)
             {
                 WCHAR wch = dummy[i];
                 unsigned int ce;

                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
                  * and skips white space and punctuation characters for
                  * NORM_IGNORESYMBOLS.
                  */
                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
                     continue;

                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);

                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
                 if (ce != (unsigned int)-1)
                 {
                     if (ce >> 16) key_len[0] += 2;
                     if ((ce >> 8) & 0xff) key_len[1]++;
                     if ((ce >> 4) & 0x0f) key_len[2]++;
                     if (ce & 1)
                     {
                         /* zero bytes are never stored in the key, so count
                          * exactly the bytes that pass 2 is going to write */
                         if (wch >> 8) key_len[3]++;
                         if (wch & 0xff) key_len[3]++;
                     }
                 }
                 else
                 {
                     /* no collation element: two marker bytes followed by
                      * the non-zero bytes of the character code */
                     key_len[0] += 2;
                     if (wch >> 8) key_len[0]++;
                     if (wch & 0xff) key_len[0]++;
                 }
             }
         }
     }

     /* 4 * '\1' + 1 * '\0' + key length */
     total_len = key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;

     if (!dstlen) /* compute length */
         return total_len;

     if (dstlen < total_len)
         return 0; /* overflow */

     /* pass 2: rewind the source and fill the four sections in place */
     src = src_save;
     srclen = srclen_save;

     /* each section starts right after the '\1' separator of the previous one */
     key_ptr[0] = dst;
     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
     key_ptr[3] = key_ptr[2] + key_len[2] + 1;

     for (; srclen; srclen--, src++)
     {
         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
         dummy[0] = *src;
         if (decomposed_len)
         {
             int i;
             for (i = 0; i < decomposed_len; i++)
             {
                 WCHAR wch = dummy[i];
                 unsigned int ce;

                 /* same filtering and case folding as in pass 1, otherwise
                  * the section offsets computed above would not match */
                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
                     continue;

                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);

                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
                 if (ce != (unsigned int)-1)
                 {
                     WCHAR key;
                     if ((key = ce >> 16))
                     {
                         *key_ptr[0]++ = key >> 8;
                         *key_ptr[0]++ = key & 0xff;
                     }
                     /* make key 1 start from 2 */
                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
                     /* make key 2 start from 2 */
                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
                     /* key 3 is always a character code */
                     if (ce & 1)
                     {
                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
                     }
                 }
                 else
                 {
                     *key_ptr[0]++ = 0xff;
                     *key_ptr[0]++ = 0xfe;
                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
                 }
             }
         }
     }

     /* close every section, then terminate the whole key */
     *key_ptr[0] = '\1';
     *key_ptr[1] = '\1';
     *key_ptr[2] = '\1';
     *key_ptr[3]++ = '\1';
     *key_ptr[3] = 0;

     /* the terminating 0 is not part of the returned length */
     return key_ptr[3] - dst;
 }

 /*
  * Compare two strings by unicode weight only (the high 16 bits of each
  * 32-bit collation_table element).
  *
  * With NORM_IGNORESYMBOLS, punctuation and white space characters are
  * stepped over.  Without SORT_STRINGSORT, a hyphen or apostrophe facing a
  * character that is neither of them is stepped over as well.  When either
  * character has no collation element, the raw character codes are compared.
  *
  * Returns the first non-zero difference, or len1 - len2 of what is left
  * once one of the strings is exhausted.
  */
 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
                                           const WCHAR *str2, int len2)
 {
     const int skip_symbols = flags & NORM_IGNORESYMBOLS;
     const int word_sort = !(flags & SORT_STRINGSORT);

     while (len1 > 0 && len2 > 0)
     {
         unsigned int left, right;
         int diff;

         if (skip_symbols)
         {
             /* FIXME: not tested */
             int left_sym = get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE);
             int right_sym = get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE);

             if (left_sym)
             {
                 str1++;
                 len1--;
             }
             if (right_sym)
             {
                 str2++;
                 len2--;
             }
             if (left_sym || right_sym) continue;
         }

         /* hyphen and apostrophe are treated differently depending on
          * whether SORT_STRINGSORT specified or not
          */
         if (word_sort)
         {
             int left_mark = (*str1 == '-' || *str1 == '\'');
             int right_mark = (*str2 == '-' || *str2 == '\'');

             if (left_mark && !right_mark)
             {
                 str1++;
                 len1--;
                 continue;
             }
             if (!left_mark && right_mark)
             {
                 str2++;
                 len2--;
                 continue;
             }
         }

         left = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
         right = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

         if (left == (unsigned int)-1 || right == (unsigned int)-1)
             diff = *str1 - *str2;
         else
             diff = (left >> 16) - (right >> 16);

         if (diff) return diff;

         str1++;
         len1--;
         str2++;
         len2--;
     }
     return len1 - len2;
 }

 /*
  * Compare two strings by diacritic weight only (bits 8-15 of each 32-bit
  * collation_table element).
  *
  * With NORM_IGNORESYMBOLS, punctuation and white space characters are
  * stepped over.  When either character has no collation element, the raw
  * character codes are compared.
  *
  * Returns the first non-zero difference, or len1 - len2 of what is left
  * once one of the strings is exhausted.
  */
 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
                                             const WCHAR *str2, int len2)
 {
     const int skip_symbols = flags & NORM_IGNORESYMBOLS;

     while (len1 > 0 && len2 > 0)
     {
         unsigned int left, right;
         int diff;

         if (skip_symbols)
         {
             /* FIXME: not tested */
             int left_sym = get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE);
             int right_sym = get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE);

             if (left_sym)
             {
                 str1++;
                 len1--;
             }
             if (right_sym)
             {
                 str2++;
                 len2--;
             }
             if (left_sym || right_sym) continue;
         }

         left = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
         right = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

         if (left == (unsigned int)-1 || right == (unsigned int)-1)
             diff = *str1 - *str2;
         else
             diff = ((left >> 8) & 0xff) - ((right >> 8) & 0xff);

         if (diff) return diff;

         str1++;
         len1--;
         str2++;
         len2--;
     }
     return len1 - len2;
 }

 /*
  * Compare two strings by case weight only (bits 4-7 of each 32-bit
  * collation_table element).
  *
  * With NORM_IGNORESYMBOLS, punctuation and white space characters are
  * stepped over.  When either character has no collation element, the raw
  * character codes are compared.
  *
  * Returns the first non-zero difference, or len1 - len2 of what is left
  * once one of the strings is exhausted.
  */
 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
                                        const WCHAR *str2, int len2)
 {
     const int skip_symbols = flags & NORM_IGNORESYMBOLS;

     while (len1 > 0 && len2 > 0)
     {
         unsigned int left, right;
         int diff;

         if (skip_symbols)
         {
             /* FIXME: not tested */
             int left_sym = get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE);
             int right_sym = get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE);

             if (left_sym)
             {
                 str1++;
                 len1--;
             }
             if (right_sym)
             {
                 str2++;
                 len2--;
             }
             if (left_sym || right_sym) continue;
         }

         left = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
         right = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

         if (left == (unsigned int)-1 || right == (unsigned int)-1)
             diff = *str1 - *str2;
         else
             diff = ((left >> 4) & 0x0f) - ((right >> 4) & 0x0f);

         if (diff) return diff;

         str1++;
         len1--;
         str2++;
         len2--;
     }
     return len1 - len2;
 }

 /*
  * Count the characters of str that come before the first 0 character,
  * looking at no more than len characters.  A negative len never reaches
  * zero in practice, so the scan then only stops at the 0 character.
  */
 static inline int real_length(const WCHAR *str, int len)
 {
     int count;

     for (count = 0; len != 0 && str[count] != 0; len--)
         count++;
     return count;
 }

 /*
  * Compare two strings using collation_table weights.
  *
  * Both lengths are first cut at the first 0 character.  Unicode weights
  * decide first; on a tie diacritic weights are compared unless
  * NORM_IGNORENONSPACE is set, and on a further tie case weights are
  * compared unless NORM_IGNORECASE is set.
  *
  * Returns a negative, zero or positive value.
  */
 int wine_compare_string(int flags, const WCHAR *str1, int len1,
                         const WCHAR *str2, int len2)
 {
     int result;

     len1 = real_length(str1, len1);
     len2 = real_length(str2, len2);

     result = compare_unicode_weights(flags, str1, len1, str2, len2);
     if (result) return result;

     if (!(flags & NORM_IGNORENONSPACE))
     {
         result = compare_diacritic_weights(flags, str1, len1, str2, len2);
         if (result) return result;
     }

     /* everything compared so far is equal, result is 0 here */
     if (flags & NORM_IGNORECASE) return result;

     return compare_case_weights(flags, str1, len1, str2, len2);
 }
	/*
	* Unicode sort key generation
	*
	* Copyright 2003 Dmitry Timoshkov
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	*/
	#include "wine/unicode.h"

	extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
	extern const unsigned int collation_table[];

	/*
	 * wine_get_sortkey
	 *
	 * Build a byte-string sort key for a string from collation_table.
	 *
	 * flags  - normalization NORM_* flags; only NORM_IGNORESYMBOLS and
	 *          NORM_IGNORECASE are looked at here
	 * src    - source characters
	 * srclen - number of characters in src
	 * dst    - output buffer, not written when dstlen is 0 or too small
	 * dstlen - size of dst in bytes, or 0 to only compute the needed size
	 *
	 * The key consists of four sections (unicode weights, diacritic weights,
	 * case weights, character codes), each one closed by a '\1' byte, and a
	 * final 0 byte.
	 *
	 * Returns the needed buffer size (terminating 0 included) when dstlen is 0,
	 * 0 when dst is too small, otherwise the number of bytes stored in dst
	 * not counting the terminating 0.
	 *
	 * FIXME: 'variable' flag not handled
	 */
	int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
	{
	    WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
	    int key_len[4];
	    char *key_ptr[4];
	    const WCHAR *src_save = src;
	    int srclen_save = srclen;
	    int total_len;

	    /* pass 1: count the bytes needed by each of the four key sections */
	    key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
	    for (; srclen; srclen--, src++)
	    {
	        /* the get_decomposition() call is disabled, so every source
	         * character is processed as a single undecomposed character */
	        int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
	        dummy[0] = *src;
	        if (decomposed_len)
	        {
	            int i;
	            for (i = 0; i < decomposed_len; i++)
	            {
	                WCHAR wch = dummy[i];
	                unsigned int ce;

	                /* tests show that win2k just ignores NORM_IGNORENONSPACE,
	                 * and skips white space and punctuation characters for
	                 * NORM_IGNORESYMBOLS.
	                 */
	                if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
	                    continue;

	                if (flags & NORM_IGNORECASE) wch = tolowerW(wch);

	                ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
	                if (ce != (unsigned int)-1)
	                {
	                    if (ce >> 16) key_len[0] += 2;
	                    if ((ce >> 8) & 0xff) key_len[1]++;
	                    if ((ce >> 4) & 0x0f) key_len[2]++;
	                    if (ce & 1)
	                    {
	                        /* zero bytes are never stored in the key, so count
	                         * exactly the bytes that pass 2 is going to write */
	                        if (wch >> 8) key_len[3]++;
	                        if (wch & 0xff) key_len[3]++;
	                    }
	                }
	                else
	                {
	                    /* no collation element: two marker bytes followed by
	                     * the non-zero bytes of the character code */
	                    key_len[0] += 2;
	                    if (wch >> 8) key_len[0]++;
	                    if (wch & 0xff) key_len[0]++;
	                }
	            }
	        }
	    }

	    /* 4 * '\1' + 1 * '\0' + key length */
	    total_len = key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;

	    if (!dstlen) /* compute length */
	        return total_len;

	    if (dstlen < total_len)
	        return 0; /* overflow */

	    /* pass 2: rewind the source and fill the four sections in place */
	    src = src_save;
	    srclen = srclen_save;

	    /* each section starts right after the '\1' separator of the previous one */
	    key_ptr[0] = dst;
	    key_ptr[1] = key_ptr[0] + key_len[0] + 1;
	    key_ptr[2] = key_ptr[1] + key_len[1] + 1;
	    key_ptr[3] = key_ptr[2] + key_len[2] + 1;

	    for (; srclen; srclen--, src++)
	    {
	        int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
	        dummy[0] = *src;
	        if (decomposed_len)
	        {
	            int i;
	            for (i = 0; i < decomposed_len; i++)
	            {
	                WCHAR wch = dummy[i];
	                unsigned int ce;

	                /* same filtering and case folding as in pass 1, otherwise
	                 * the section offsets computed above would not match */
	                if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
	                    continue;

	                if (flags & NORM_IGNORECASE) wch = tolowerW(wch);

	                ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
	                if (ce != (unsigned int)-1)
	                {
	                    WCHAR key;
	                    if ((key = ce >> 16))
	                    {
	                        *key_ptr[0]++ = key >> 8;
	                        *key_ptr[0]++ = key & 0xff;
	                    }
	                    /* make key 1 start from 2 */
	                    if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
	                    /* make key 2 start from 2 */
	                    if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
	                    /* key 3 is always a character code */
	                    if (ce & 1)
	                    {
	                        if (wch >> 8) *key_ptr[3]++ = wch >> 8;
	                        if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
	                    }
	                }
	                else
	                {
	                    *key_ptr[0]++ = 0xff;
	                    *key_ptr[0]++ = 0xfe;
	                    if (wch >> 8) *key_ptr[0]++ = wch >> 8;
	                    if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
	                }
	            }
	        }
	    }

	    /* close every section, then terminate the whole key */
	    *key_ptr[0] = '\1';
	    *key_ptr[1] = '\1';
	    *key_ptr[2] = '\1';
	    *key_ptr[3]++ = '\1';
	    *key_ptr[3] = 0;

	    /* the terminating 0 is not part of the returned length */
	    return key_ptr[3] - dst;
	}

	/*
	 * Compare two strings by unicode weight only (the high 16 bits of each
	 * 32-bit collation_table element).
	 *
	 * With NORM_IGNORESYMBOLS, punctuation and white space characters are
	 * stepped over.  Without SORT_STRINGSORT, a hyphen or apostrophe facing a
	 * character that is neither of them is stepped over as well.  When either
	 * character has no collation element, the raw character codes are compared.
	 *
	 * Returns the first non-zero difference, or len1 - len2 of what is left
	 * once one of the strings is exhausted.
	 */
	static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
	                                          const WCHAR *str2, int len2)
	{
	    unsigned int ce1, ce2;
	    int ret;

	    /* 32-bit collation element table format:
	     * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
	     * case weight - high 4 bit of low 8 bit.
	     */
	    while (len1 > 0 && len2 > 0)
	    {
	        if (flags & NORM_IGNORESYMBOLS)
	        {
	            int skip = 0;
	            /* FIXME: not tested */
	            if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
	            {
	                str1++;
	                len1--;
	                skip = 1;
	            }
	            if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
	            {
	                str2++;
	                len2--;
	                skip = 1;
	            }
	            if (skip) continue;
	        }

	        /* hyphen and apostrophe are treated differently depending on
	         * whether SORT_STRINGSORT specified or not
	         */
	        if (!(flags & SORT_STRINGSORT))
	        {
	            if (*str1 == '-' || *str1 == '\'')
	            {
	                if (*str2 != '-' && *str2 != '\'')
	                {
	                    str1++;
	                    len1--;
	                    continue;
	                }
	            }
	            else if (*str2 == '-' || *str2 == '\'')
	            {
	                str2++;
	                len2--;
	                continue;
	            }
	        }

	        /* two-level lookup: high byte selects the page, low byte the entry */
	        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
	        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

	        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
	            ret = (ce1 >> 16) - (ce2 >> 16);
	        else
	            ret = *str1 - *str2;

	        if (ret) return ret;

	        str1++;
	        str2++;
	        len1--;
	        len2--;
	    }
	    return len1 - len2;
	}

	/*
	 * Compare two strings by diacritic weight only (bits 8-15 of each 32-bit
	 * collation_table element).
	 *
	 * With NORM_IGNORESYMBOLS, punctuation and white space characters are
	 * stepped over.  When either character has no collation element, the raw
	 * character codes are compared.
	 *
	 * Returns the first non-zero difference, or len1 - len2 of what is left
	 * once one of the strings is exhausted.
	 */
	static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
	                                            const WCHAR *str2, int len2)
	{
	    unsigned int ce1, ce2;
	    int ret;

	    /* 32-bit collation element table format:
	     * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
	     * case weight - high 4 bit of low 8 bit.
	     */
	    while (len1 > 0 && len2 > 0)
	    {
	        if (flags & NORM_IGNORESYMBOLS)
	        {
	            int skip = 0;
	            /* FIXME: not tested */
	            if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
	            {
	                str1++;
	                len1--;
	                skip = 1;
	            }
	            if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
	            {
	                str2++;
	                len2--;
	                skip = 1;
	            }
	            if (skip) continue;
	        }

	        /* two-level lookup: high byte selects the page, low byte the entry */
	        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
	        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

	        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
	            ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
	        else
	            ret = *str1 - *str2;

	        if (ret) return ret;

	        str1++;
	        str2++;
	        len1--;
	        len2--;
	    }
	    return len1 - len2;
	}

	/*
	 * Compare two strings by case weight only (bits 4-7 of each 32-bit
	 * collation_table element).
	 *
	 * With NORM_IGNORESYMBOLS, punctuation and white space characters are
	 * stepped over.  When either character has no collation element, the raw
	 * character codes are compared.
	 *
	 * Returns the first non-zero difference, or len1 - len2 of what is left
	 * once one of the strings is exhausted.
	 */
	static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
	                                       const WCHAR *str2, int len2)
	{
	    unsigned int ce1, ce2;
	    int ret;

	    /* 32-bit collation element table format:
	     * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
	     * case weight - high 4 bit of low 8 bit.
	     */
	    while (len1 > 0 && len2 > 0)
	    {
	        if (flags & NORM_IGNORESYMBOLS)
	        {
	            int skip = 0;
	            /* FIXME: not tested */
	            if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
	            {
	                str1++;
	                len1--;
	                skip = 1;
	            }
	            if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
	            {
	                str2++;
	                len2--;
	                skip = 1;
	            }
	            if (skip) continue;
	        }

	        /* two-level lookup: high byte selects the page, low byte the entry */
	        ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
	        ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];

	        if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
	            ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
	        else
	            ret = *str1 - *str2;

	        if (ret) return ret;

	        str1++;
	        str2++;
	        len1--;
	        len2--;
	    }
	    return len1 - len2;
	}

	/*
	 * Count the characters of str that come before the first 0 character,
	 * looking at no more than len characters.  A negative len never reaches
	 * zero in practice, so the scan then only stops at the 0 character.
	 */
	static inline int real_length(const WCHAR *str, int len)
	{
	    int count;

	    for (count = 0; len != 0 && str[count] != 0; len--)
	        count++;
	    return count;
	}

	/*
	 * Compare two strings using collation_table weights.
	 *
	 * Both lengths are first cut at the first 0 character.  Unicode weights
	 * decide first; on a tie diacritic weights are compared unless
	 * NORM_IGNORENONSPACE is set, and on a further tie case weights are
	 * compared unless NORM_IGNORECASE is set.
	 *
	 * Returns a negative, zero or positive value.
	 */
	int wine_compare_string(int flags, const WCHAR *str1, int len1,
	                        const WCHAR *str2, int len2)
	{
	    int result;

	    len1 = real_length(str1, len1);
	    len2 = real_length(str2, len2);

	    result = compare_unicode_weights(flags, str1, len1, str2, len2);
	    if (result) return result;

	    if (!(flags & NORM_IGNORENONSPACE))
	    {
	        result = compare_diacritic_weights(flags, str1, len1, str2, len2);
	        if (result) return result;
	    }

	    /* everything compared so far is equal, result is 0 here */
	    if (flags & NORM_IGNORECASE) return result;

	    return compare_case_weights(flags, str1, len1, str2, len2);
	}