/*
***************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*   file name:  uspoof.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2008Feb13
*   created by: Andy Heninger
*
*   Unicode Spoof Detection
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uassert.h"
23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if !UCONFIG_NO_NORMALIZATION 26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include <stdio.h> // debug 29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_USE 31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI USpoofChecker * U_EXPORT2 34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_open(UErrorCode *status) { 35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status); 39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete si; 41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) si = NULL; 42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (USpoofChecker *)si; 44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI USpoofChecker * U_EXPORT2 
48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, 49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofData *sd = new SpoofData(data, length, *status); 54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *si = new SpoofImpl(sd, *status); 55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete sd; 57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete si; 58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (sd == NULL || si == NULL) { 61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete sd; 63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete si; 64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pActualLength != NULL) { 68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *pActualLength = sd->fRawData->fLength; 69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return reinterpret_cast<USpoofChecker 
*>(si); 71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI USpoofChecker * U_EXPORT2 75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_clone(const USpoofChecker *sc, UErrorCode *status) { 76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const SpoofImpl *src = SpoofImpl::validateThis(sc, *status); 77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (src == NULL) { 78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor 81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete result; 83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result = NULL; 84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (USpoofChecker *)result; 86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI void U_EXPORT2 90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_close(USpoofChecker *sc) { 91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, status); 93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete This; 
94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI void U_EXPORT2 98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) { 99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Verify that the requested checks are all ones (bits) that 105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // are acceptable, known values. 
106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (checks & ~USPOOF_ALL_CHECKS) { 107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ILLEGAL_ARGUMENT_ERROR; 108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) This->fChecks = checks; 112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { 117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return This->fChecks; 122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI void U_EXPORT2 125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { 126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 
129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) This->setAllowedLocales(localesList, *status); 131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI const char * U_EXPORT2 134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { 135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return This->getAllowedLocales(*status); 140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI const USet * U_EXPORT2 144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) { 145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status); 146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return reinterpret_cast<const USet *>(result); 147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI const UnicodeSet * U_EXPORT2 150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getAllowedUnicodeSet(const 
USpoofChecker *sc, UErrorCode *status) { 151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return This->fAllowedCharsSet; 156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI void U_EXPORT2 160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) { 161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars); 162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uspoof_setAllowedUnicodeSet(sc, set, status); 163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI void U_EXPORT2 167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) { 168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 
172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (chars->isBogus()) { 173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ILLEGAL_ARGUMENT_ERROR; 174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone()); 177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (clonedSet == NULL || clonedSet->isBogus()) { 178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) clonedSet->freeze(); 182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete This->fAllowedCharsSet; 183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) This->fAllowedCharsSet = clonedSet; 184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) This->fChecks |= USPOOF_CHAR_LIMIT; 185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_check(const USpoofChecker *sc, 190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *text, int32_t length, 191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t *position, 192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const 
SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (length < -1) { 199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ILLEGAL_ARGUMENT_ERROR; 200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (length == -1) { 203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // It's not worth the bother to handle nul terminated strings everywhere. 204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Just get the length and be done with it. 205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) length = u_strlen(text); 206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t result = 0; 209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32? 210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // A count of the number of non-Common or inherited scripts. 212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests. 213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Share the computation when possible. scriptCount == -1 means that we haven't 214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // done it yet. 
215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t scriptCount = -1; 216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) { 218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) scriptCount = This->scriptScan(text, length, failPos, *status); 219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // printf("scriptCount (clipped to 2) = %d\n", scriptCount); 220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ( scriptCount >= 2) { 221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Note: scriptCount == 2 covers all cases of the number of scripts >= 2 222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_SINGLE_SCRIPT; 223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This->fChecks & USPOOF_CHAR_LIMIT) { 227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i; 228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c; 229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i=0; i<length ;) { 230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_NEXT(text, i, length, c); 231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!This->fAllowedCharsSet->contains(c)) { 232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_CHAR_LIMIT; 233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (i < failPos) { 234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) failPos = i; 235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 
237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This->fChecks & 242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { 243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // These are the checks that need to be done on NFKD input 244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) NFKDBuffer normalizedInput(text, length, *status); 245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *nfkdText = normalizedInput.getBuffer(); 246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t nfkdLength = normalizedInput.getLength(); 247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This->fChecks & USPOOF_INVISIBLE) { 249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // scan for more than one occurence of the same non-spacing mark 251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // in a sequence of non-spacing marks. 252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i; 253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c; 254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 firstNonspacingMark = 0; 255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool haveMultipleMarks = FALSE; 256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. 
257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i=0; i<length ;) { 259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_NEXT(nfkdText, i, nfkdLength, c); 260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (u_charType(c) != U_NON_SPACING_MARK) { 261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) firstNonspacingMark = 0; 262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (haveMultipleMarks) { 263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) marksSeenSoFar.clear(); 264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) haveMultipleMarks = FALSE; 265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (firstNonspacingMark == 0) { 269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) firstNonspacingMark = c; 270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!haveMultipleMarks) { 273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) marksSeenSoFar.add(firstNonspacingMark); 274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) haveMultipleMarks = TRUE; 275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (marksSeenSoFar.contains(c)) { 277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // report the error, and stop scanning. 278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // No need to find more than the first failure. 
279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_INVISIBLE; 280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) failPos = i; 281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) marksSeenSoFar.add(c); 284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) { 289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The basic test is the same for both whole and mixed script confusables. 290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Compute the set of scripts that every input character has a confusable in. 291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For this computation an input character is always considered to be 292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // confusable with itself in its own script. 293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If the number of such scripts is two or more, and the input consisted of 294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // characters all from a single script, we have a whole script confusable. 
295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // (The two scripts will be the original script and the one that is confusable) 296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If the number of such scripts >= one, and the original input contained characters from 297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // more than one script, we have a mixed script confusable. (We can transform 298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // some of the characters, and end up with a visually similar string all in 299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // one script.) 300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (scriptCount == -1) { 302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t t; 303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) scriptCount = This->scriptScan(text, length, t, *status); 304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ScriptSet scripts; 307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status); 308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t confusableScriptCount = scripts.countMembers(); 309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //printf("confusableScriptCount = %d\n", confusableScriptCount); 310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) && 312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) confusableScriptCount >= 2 && 313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) scriptCount == 1) { 
314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; 315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) && 318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) confusableScriptCount >= 1 && 319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) scriptCount > 1) { 320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; 321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (position != NULL && failPos != 0x7fffffff) { 325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *position = failPos; 326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_checkUTF8(const USpoofChecker *sc, 333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const char *text, int32_t length, 334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t *position, 335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if 
(U_FAILURE(*status)) { 338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar stackBuf[USPOOF_STACK_BUFFER_SIZE]; 341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar* text16 = stackBuf; 342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t len16; 343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status); 345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2)); 350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (text16 == NULL) { 351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(text16, len16+1, NULL, text, length, status); 356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t position16 = -1; 359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 
int32_t result = uspoof_check(sc, text16, len16, &position16, status); 360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (position16 > 0) { 365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Translate a UTF-16 based error position back to a UTF-8 offset. 366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // u_strToUTF8() in preflight mode is an easy way to do it. 367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U_ASSERT(position16 <= len16); 368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strToUTF8(NULL, 0, position, text16, position16, status); 369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (position > 0) { 370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // position is the required buffer length from u_strToUTF8, which includes 371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // space for a terminating NULL, which we don't want, hence the -1. 372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *position -= 1; 373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR. 
375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (text16 != stackBuf) { 378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(text16); 379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* A convenience wrapper around the public uspoof_getSkeleton that handles 385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * allocating a larger buffer than provided if the original is too small. 386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength, 388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) { 389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t requiredCapacity = 0; 390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *buf = dest; 391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status); 396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar))); 398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf == NULL) { 399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status); 404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *outputLength = requiredCapacity; 406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return buf; 407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_areConfusable(const USpoofChecker *sc, 412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s1, int32_t length1, 413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s2, int32_t length2, 414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 
418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, 421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and for definitions of the types (single, whole, mixed-script) of confusables. 422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We only care about a few of the check flags. Ignore the others. 424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If no tests relavant to this function have been specified, return an error. 425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // TODO: is this really the right thing to do? It's probably an error on the caller's part, 426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // but logically we would just return 0 (no error). 
427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | 428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) { 429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_INVALID_STATE_ERROR; 430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE; 433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; 434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *s1Skeleton; 435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t s1SkeletonLength = 0; 436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; 438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *s2Skeleton; 439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t s2SkeletonLength = 0; 440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t result = 0; 442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t t; 443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status); 444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status); 445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { 447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) // Do the Single Script compare. 448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1ScriptCount <= 1 && s2ScriptCount <= 1) { 449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; 450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, 451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); 452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, 453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); 454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { 455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; 456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1Skeleton != s1SkeletonBuf) { 458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s1Skeleton); 459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s2Skeleton != s2SkeletonBuf) { 461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s2Skeleton); 462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { 
467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If the two inputs are single script confusable they cannot also be 468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // mixed or whole script confusable, according to the UAX39 definitions. 469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // So we can skip those tests. 470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Optimization for whole script confusables test: two identifiers are whole script confusable if 474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // each is of a single script and they are mixed script confusable. 475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool possiblyWholeScriptConfusables = 476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE); 477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Mixed Script Check 480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) { 482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us 483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // the mixed script table skeleton, which is what we want. 
484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. 485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; 486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, 487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); 488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, 489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); 490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { 491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; 492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (possiblyWholeScriptConfusables) { 493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; 494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1Skeleton != s1SkeletonBuf) { 497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s1Skeleton); 498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s2Skeleton != s2SkeletonBuf) { 500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s2Skeleton); 501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) } 503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Convenience function for converting a UTF-8 input to a UChar * string, including 509f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// reallocating a buffer when required. Parameters and their interpretation mostly 510f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// match u_strFromUTF8. 511f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 512f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength, 513f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const char *in, int32_t inLength, UErrorCode *status) { 514f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 515f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 516f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 517f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *dest = outBuf; 518f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status); 519f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 520f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar))); 521f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (dest == NULL) { 522f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 
523f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return NULL; 524f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 525f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 526f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status); 527f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 528f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dest; 529f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 530f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 531f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 532f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 533f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 534f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_areConfusableUTF8(const USpoofChecker *sc, 535f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const char *s1, int32_t length1, 536f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const char *s2, int32_t length2, 537f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 538f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 539f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl::validateThis(sc, *status); 540f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 541f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 542f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 543f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 544f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar s1Buf[USPOOF_STACK_BUFFER_SIZE]; 545f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t lengthS1U; 546f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *s1U = 
convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status); 547f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 548f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar s2Buf[USPOOF_STACK_BUFFER_SIZE]; 549f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t lengthS2U; 550f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status); 551f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 552f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status); 553f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 554f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s1U != s1Buf) { 555f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s1U); 556f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 557f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s2U != s2Buf) { 558f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(s2U); 559f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 560f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return results; 561f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 562f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 563f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 564f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 565f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 566f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const U_NAMESPACE_QUALIFIER UnicodeString &s1, 567f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const U_NAMESPACE_QUALIFIER UnicodeString &s2, 
568f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 569f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 570f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *u1 = s1.getBuffer(); 571f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length1 = s1.length(); 572f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *u2 = s2.getBuffer(); 573f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t length2 = s2.length(); 574f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 575f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status); 576f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return results; 577f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 578f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 579f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 580f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 581f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 582f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 583f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_checkUnicodeString(const USpoofChecker *sc, 584f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const U_NAMESPACE_QUALIFIER UnicodeString &text, 585f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t *position, 586f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 587f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status); 588f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 589f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 590f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) 591f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 592f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 593f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getSkeleton(const USpoofChecker *sc, 594f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t type, 595f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *s, int32_t length, 596f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *dest, int32_t destCapacity, 597f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 598f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 599f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // TODO: this function could be sped up a bit 600f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Skip the input normalization when not needed, work from callers data. 601f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Put the initial skeleton straight into the caller's destination buffer. 602f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // It probably won't need normalization. 603f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // But these would make the structure more complicated. 
604f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 605f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 606f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 607f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 608f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 609f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) || 610f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) { 611f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ILLEGAL_ARGUMENT_ERROR; 612f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 613f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 614f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 615f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t tableMask = 0; 616f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) switch (type) { 617f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case 0: 618f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tableMask = USPOOF_ML_TABLE_FLAG; 619f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 620f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case USPOOF_SINGLE_SCRIPT_CONFUSABLE: 621f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tableMask = USPOOF_SL_TABLE_FLAG; 622f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 623f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case USPOOF_ANY_CASE: 624f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tableMask = USPOOF_MA_TABLE_FLAG; 625f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 626f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE: 627f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tableMask = USPOOF_SA_TABLE_FLAG; 628f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 629f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) default: 630f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ILLEGAL_ARGUMENT_ERROR; 631f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 632f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 633f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 634f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NFKD transform of the user supplied input 635f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 636f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE]; 637f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *nfkdInput = nfkdStackBuf; 638f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t normalizedLen = unorm_normalize( 639f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status); 640f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 641f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); 642f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (nfkdInput == NULL) { 643f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 644f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 645f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 646f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 647f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) normalizedLen = 
unorm_normalize(s, length, UNORM_NFKD, 0, 648f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) nfkdInput, normalizedLen+1, status); 649f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 650f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 651f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (nfkdInput != nfkdStackBuf) { 652f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(nfkdInput); 653f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 654f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 655f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 656f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 657f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // buffer to hold the Unicode defined skeleton mappings for a single code point 658f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; 659f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 660f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Apply the skeleton mapping to the NFKD normalized input string 661f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. 
662f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t inputIndex = 0; 663f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString skelStr; 664f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (inputIndex < normalizedLen) { 665f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c; 666f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_NEXT(nfkdInput, inputIndex, normalizedLen, c); 667f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t replaceLen = This->confusableLookup(c, tableMask, buf); 668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) skelStr.append(buf, replaceLen); 669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (nfkdInput != nfkdStackBuf) { 672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(nfkdInput); 673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *result = skelStr.getBuffer(); 676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t resultLen = skelStr.length(); 677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *normedResult = NULL; 678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Check the skeleton for NFKD, normalize it if needed. 680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Unnormalized results should be very rare. 
681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) { 682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status); 683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar))); 684f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (normedResult == NULL) { 685f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 686f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 687f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 688f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 689f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status); 690f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) result = normedResult; 691f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) resultLen = normalizedLen; 692f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 693f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 694f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Copy the skeleton to the caller's buffer 695f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_SUCCESS(*status)) { 696f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (destCapacity == 0 || resultLen > destCapacity) { 697f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = resultLen>destCapacity ? 
U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING; 698f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 699f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_memcpy(dest, result, resultLen); 700f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (destCapacity > resultLen) { 701f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dest[resultLen] = 0; 702f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 703f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_STRING_NOT_TERMINATED_WARNING; 704f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 705f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 706f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 707f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(normedResult); 708f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return resultLen; 709f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 710f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 711f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 712f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 713f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI UnicodeString & U_EXPORT2 714f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 715f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t type, 716f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString &s, 717f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString &dest, 718f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 719f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 720f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dest; 
721f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 722f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dest.remove(); 723f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 724f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *str = s.getBuffer(); 725f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t strLen = s.length(); 726f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar smallBuf[USPOOF_STACK_BUFFER_SIZE]; 727f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *buf = smallBuf; 728f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status); 729f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 730f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar))); 731f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf == NULL) { 732f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 733f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dest; 734f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 735f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 736f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status); 737f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 738f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_SUCCESS(*status)) { 739f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dest.setTo(buf, outputSize); 740f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 741f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 
742f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf != smallBuf) { 743f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(buf); 744f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 745f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dest; 746f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 747f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 748f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 749f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 750f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_getSkeletonUTF8(const USpoofChecker *sc, 751f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint32_t type, 752f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const char *s, int32_t length, 753f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) char *dest, int32_t destCapacity, 754f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode *status) { 755f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Lacking a UTF-8 normalization API, just converting the input to 756f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // UTF-16 seems as good an approach as any. In typical use, input will 757f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // be an identifier, which is to say not too long for stack buffers. 758f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(*status)) { 759f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 760f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 761f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Buffers for the UChar form of the input and skeleton strings. 
762f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE]; 763f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *inBuf = smallInBuf; 764f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE]; 765f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *outBuf = smallOutBuf; 766f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 767f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t lengthInUChars = 0; 768f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t skelLengthInUChars = 0; 769f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t skelLengthInUTF8 = 0; 770f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 771f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars, 772f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s, length, status); 773f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 774f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar))); 775f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (inBuf == NULL) { 776f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 777f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) goto cleanup; 778f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 779f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 780f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars, 781f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) s, length, status); 782f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 
783f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 784f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, 785f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) outBuf, USPOOF_STACK_BUFFER_SIZE, status); 786f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (*status == U_BUFFER_OVERFLOW_ERROR) { 787f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar))); 788f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (outBuf == NULL) { 789f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_MEMORY_ALLOCATION_ERROR; 790f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) goto cleanup; 791f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 792f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_ZERO_ERROR; 793f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, 794f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) outBuf, skelLengthInUChars+1, status); 795f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 796f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 797f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) u_strToUTF8(dest, destCapacity, &skelLengthInUTF8, 798f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) outBuf, skelLengthInUChars, status); 799f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 800f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cleanup: 801f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (inBuf != smallInBuf) { 802f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(inBuf); 803f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 
804f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (outBuf != smallOutBuf) { 805f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(outBuf); 806f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 807f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return skelLengthInUTF8; 808f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 809f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 810f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 811f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t U_EXPORT2 812f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) { 813f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 814f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (This == NULL) { 815f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U_ASSERT(U_FAILURE(*status)); 816f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 817f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 818f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t dataSize = This->fSpoofData->fRawData->fLength; 819f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (capacity < dataSize) { 820f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_BUFFER_OVERFLOW_ERROR; 821f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dataSize; 822f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 823f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize); 824f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dataSize; 825f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 826f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) 827f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif 828