|  | /* | 
|  | * Unicode sort key generation | 
|  | * | 
|  | * Copyright 2003 Dmitry Timoshkov | 
|  | * | 
|  | * This library is free software; you can redistribute it and/or | 
|  | * modify it under the terms of the GNU Lesser General Public | 
|  | * License as published by the Free Software Foundation; either | 
|  | * version 2.1 of the License, or (at your option) any later version. | 
|  | * | 
|  | * This library is distributed in the hope that it will be useful, | 
|  | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
|  | * Lesser General Public License for more details. | 
|  | * | 
|  | * You should have received a copy of the GNU Lesser General Public | 
|  | * License along with this library; if not, write to the Free Software | 
|  | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | 
|  | */ | 
|  | #include "wine/unicode.h" | 
|  |  | 
|  | extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen); | 
|  | extern const unsigned int collation_table[]; | 
|  |  | 
|  | /* | 
|  | * flags - normalization NORM_* flags | 
|  | * | 
|  | * FIXME: 'variable' flag not handled | 
|  | */ | 
|  | int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen) | 
|  | { | 
|  | WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ | 
|  | int key_len[4]; | 
|  | char *key_ptr[4]; | 
|  | const WCHAR *src_save = src; | 
|  | int srclen_save = srclen; | 
|  |  | 
|  | key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; | 
|  | for (; srclen; srclen--, src++) | 
|  | { | 
|  | int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/ | 
|  | dummy[0] = *src; | 
|  | if (decomposed_len) | 
|  | { | 
|  | int i; | 
|  | for (i = 0; i < decomposed_len; i++) | 
|  | { | 
|  | WCHAR wch = dummy[i]; | 
|  | unsigned int ce; | 
|  |  | 
|  | /* tests show that win2k just ignores NORM_IGNORENONSPACE, | 
|  | * and skips white space and punctuation characters for | 
|  | * NORM_IGNORESYMBOLS. | 
|  | */ | 
|  | if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) | 
|  | continue; | 
|  |  | 
|  | if (flags & NORM_IGNORECASE) wch = tolowerW(wch); | 
|  |  | 
|  | ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; | 
|  | if (ce != (unsigned int)-1) | 
|  | { | 
|  | if (ce >> 16) key_len[0] += 2; | 
|  | if ((ce >> 8) & 0xff) key_len[1]++; | 
|  | if ((ce >> 4) & 0x0f) key_len[2]++; | 
|  | if (ce & 1) | 
|  | { | 
|  | if (wch >> 8) key_len[3]++; | 
|  | key_len[3]++; | 
|  | } | 
|  | } | 
|  | else | 
|  | { | 
|  | key_len[0] += 2; | 
|  | if (wch >> 8) key_len[0]++; | 
|  | if (wch & 0xff) key_len[0]++; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!dstlen) /* compute length */ | 
|  | /* 4 * '\1' + 1 * '\0' + key length */ | 
|  | return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1; | 
|  |  | 
|  | if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) | 
|  | return 0; /* overflow */ | 
|  |  | 
|  | src = src_save; | 
|  | srclen = srclen_save; | 
|  |  | 
|  | key_ptr[0] = dst; | 
|  | key_ptr[1] = key_ptr[0] + key_len[0] + 1; | 
|  | key_ptr[2] = key_ptr[1] + key_len[1] + 1; | 
|  | key_ptr[3] = key_ptr[2] + key_len[2] + 1; | 
|  |  | 
|  | for (; srclen; srclen--, src++) | 
|  | { | 
|  | int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/ | 
|  | dummy[0] = *src; | 
|  | if (decomposed_len) | 
|  | { | 
|  | int i; | 
|  | for (i = 0; i < decomposed_len; i++) | 
|  | { | 
|  | WCHAR wch = dummy[i]; | 
|  | unsigned int ce; | 
|  |  | 
|  | /* tests show that win2k just ignores NORM_IGNORENONSPACE, | 
|  | * and skips white space and punctuation characters for | 
|  | * NORM_IGNORESYMBOLS. | 
|  | */ | 
|  | if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) | 
|  | continue; | 
|  |  | 
|  | if (flags & NORM_IGNORECASE) wch = tolowerW(wch); | 
|  |  | 
|  | ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; | 
|  | if (ce != (unsigned int)-1) | 
|  | { | 
|  | WCHAR key; | 
|  | if ((key = ce >> 16)) | 
|  | { | 
|  | *key_ptr[0]++ = key >> 8; | 
|  | *key_ptr[0]++ = key & 0xff; | 
|  | } | 
|  | /* make key 1 start from 2 */ | 
|  | if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; | 
|  | /* make key 2 start from 2 */ | 
|  | if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; | 
|  | /* key 3 is always a character code */ | 
|  | if (ce & 1) | 
|  | { | 
|  | if (wch >> 8) *key_ptr[3]++ = wch >> 8; | 
|  | if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; | 
|  | } | 
|  | } | 
|  | else | 
|  | { | 
|  | *key_ptr[0]++ = 0xff; | 
|  | *key_ptr[0]++ = 0xfe; | 
|  | if (wch >> 8) *key_ptr[0]++ = wch >> 8; | 
|  | if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | *key_ptr[0] = '\1'; | 
|  | *key_ptr[1] = '\1'; | 
|  | *key_ptr[2] = '\1'; | 
|  | *key_ptr[3]++ = '\1'; | 
|  | *key_ptr[3] = 0; | 
|  |  | 
|  | return key_ptr[3] - dst; | 
|  | } | 
|  |  | 
|  | static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1, | 
|  | const WCHAR *str2, int len2) | 
|  | { | 
|  | unsigned int ce1, ce2; | 
|  | int ret; | 
|  |  | 
|  | /* 32-bit collation element table format: | 
|  | * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, | 
|  | * case weight - high 4 bit of low 8 bit. | 
|  | */ | 
|  | while (len1 > 0 && len2 > 0) | 
|  | { | 
|  | if (flags & NORM_IGNORESYMBOLS) | 
|  | { | 
|  | int skip = 0; | 
|  | /* FIXME: not tested */ | 
|  | if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str1++; | 
|  | len1--; | 
|  | skip = 1; | 
|  | } | 
|  | if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str2++; | 
|  | len2--; | 
|  | skip = 1; | 
|  | } | 
|  | if (skip) continue; | 
|  | } | 
|  |  | 
|  | /* hyphen and apostrophe are treated differently depending on | 
|  | * whether SORT_STRINGSORT specified or not | 
|  | */ | 
|  | if (!(flags & SORT_STRINGSORT)) | 
|  | { | 
|  | if (*str1 == '-' || *str1 == '\'') | 
|  | { | 
|  | if (*str2 != '-' && *str2 != '\'') | 
|  | { | 
|  | str1++; | 
|  | len1--; | 
|  | continue; | 
|  | } | 
|  | } | 
|  | else if (*str2 == '-' || *str2 == '\'') | 
|  | { | 
|  | str2++; | 
|  | len2--; | 
|  | continue; | 
|  | } | 
|  | } | 
|  |  | 
|  | ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; | 
|  | ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; | 
|  |  | 
|  | if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) | 
|  | ret = (ce1 >> 16) - (ce2 >> 16); | 
|  | else | 
|  | ret = *str1 - *str2; | 
|  |  | 
|  | if (ret) return ret; | 
|  |  | 
|  | str1++; | 
|  | str2++; | 
|  | len1--; | 
|  | len2--; | 
|  | } | 
|  | return len1 - len2; | 
|  | } | 
|  |  | 
|  | static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1, | 
|  | const WCHAR *str2, int len2) | 
|  | { | 
|  | unsigned int ce1, ce2; | 
|  | int ret; | 
|  |  | 
|  | /* 32-bit collation element table format: | 
|  | * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, | 
|  | * case weight - high 4 bit of low 8 bit. | 
|  | */ | 
|  | while (len1 > 0 && len2 > 0) | 
|  | { | 
|  | if (flags & NORM_IGNORESYMBOLS) | 
|  | { | 
|  | int skip = 0; | 
|  | /* FIXME: not tested */ | 
|  | if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str1++; | 
|  | len1--; | 
|  | skip = 1; | 
|  | } | 
|  | if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str2++; | 
|  | len2--; | 
|  | skip = 1; | 
|  | } | 
|  | if (skip) continue; | 
|  | } | 
|  |  | 
|  | ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; | 
|  | ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; | 
|  |  | 
|  | if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) | 
|  | ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff); | 
|  | else | 
|  | ret = *str1 - *str2; | 
|  |  | 
|  | if (ret) return ret; | 
|  |  | 
|  | str1++; | 
|  | str2++; | 
|  | len1--; | 
|  | len2--; | 
|  | } | 
|  | return len1 - len2; | 
|  | } | 
|  |  | 
|  | static inline int compare_case_weights(int flags, const WCHAR *str1, int len1, | 
|  | const WCHAR *str2, int len2) | 
|  | { | 
|  | unsigned int ce1, ce2; | 
|  | int ret; | 
|  |  | 
|  | /* 32-bit collation element table format: | 
|  | * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, | 
|  | * case weight - high 4 bit of low 8 bit. | 
|  | */ | 
|  | while (len1 > 0 && len2 > 0) | 
|  | { | 
|  | if (flags & NORM_IGNORESYMBOLS) | 
|  | { | 
|  | int skip = 0; | 
|  | /* FIXME: not tested */ | 
|  | if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str1++; | 
|  | len1--; | 
|  | skip = 1; | 
|  | } | 
|  | if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) | 
|  | { | 
|  | str2++; | 
|  | len2--; | 
|  | skip = 1; | 
|  | } | 
|  | if (skip) continue; | 
|  | } | 
|  |  | 
|  | ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; | 
|  | ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; | 
|  |  | 
|  | if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) | 
|  | ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f); | 
|  | else | 
|  | ret = *str1 - *str2; | 
|  |  | 
|  | if (ret) return ret; | 
|  |  | 
|  | str1++; | 
|  | str2++; | 
|  | len1--; | 
|  | len2--; | 
|  | } | 
|  | return len1 - len2; | 
|  | } | 
|  |  | 
|  | static inline int real_length(const WCHAR *str, int len) | 
|  | { | 
|  | int real_len = 0; | 
|  | while (len-- && *str++) real_len++; | 
|  | return real_len; | 
|  | } | 
|  |  | 
|  | int wine_compare_string(int flags, const WCHAR *str1, int len1, | 
|  | const WCHAR *str2, int len2) | 
|  | { | 
|  | int ret; | 
|  |  | 
|  | len1 = real_length(str1, len1); | 
|  | len2 = real_length(str2, len2); | 
|  |  | 
|  | ret = compare_unicode_weights(flags, str1, len1, str2, len2); | 
|  | if (!ret) | 
|  | { | 
|  | if (!(flags & NORM_IGNORENONSPACE)) | 
|  | ret = compare_diacritic_weights(flags, str1, len1, str2, len2); | 
|  | if (!ret && !(flags & NORM_IGNORECASE)) | 
|  | ret = compare_case_weights(flags, str1, len1, str2, len2); | 
|  | } | 
|  | return ret; | 
|  | } |