/*******************************************************************************
;*******************************************************************************
;**                                                                           **
;**                    COPYRIGHT 2004-2012 NUANCE COMMUNICATIONS              **
;**                                                                           **
;**               NUANCE COMMUNICATIONS PROPRIETARY INFORMATION               **
;**                                                                           **
;**     This software is supplied under the terms of a license agreement      **
;**     or non-disclosure agreement with Nuance Communications and may not    **
;**     be copied or disclosed except in accordance with the terms of that    **
;**     agreement.                                                            **
;**                                                                           **
;**                                                                           **
;*******************************************************************************
;**                                                                           **
;**     FileName: et9cpsegment.c                                              **
;**                                                                           **
;**  Description: Chinese XT9 phrasal segmentation module.                    **
;**               Conforming to the development version of Chinese XT9.       **
;**                                                                           **
;*******************************************************************************
;******************************************************************************/

#include "et9api.h"
#include "et9cpsys.h"
#include "et9cpldb.h"
#include "et9cprdb.h"
#include "et9cpwdls.h"
#include "et9misc.h"

/* Largest chunk, in characters, that ET9CPSegmentPhrase hands to the LDB phrase lookup at once;
 * defined as the LDB phrase size limit. */
#define ET9_CP_MAX_SEGMENT_LENGTH           ET9_CP_MAX_LDB_PHRASE_SIZE /* < -IDR-                                                          */
/* Largest input length (wStrLen) that ET9CPSegmentPhrase accepts; longer input is rejected
 * with ET9STATUS_BAD_PARAM. */
#define ET9_CP_MAX_SEGMENT_INPUT_LENGTH     256 /* < -IDR-                                                        */

/** Segment a Unicode string
 *
 *  This function is useful to decompose the input string into logical phrases.
 *
 *  The input is walked from left to right. At each position a chunk of at most
 *  ET9_CP_MAX_SEGMENT_LENGTH characters is looked up with ET9_CP_GetSubLdbPhraseFreqs:
 *    - on success, the shortest multi-character phrase reported for the chunk becomes the
 *      next segment, or a single character if no multi-character phrase is reported;
 *    - on ET9STATUS_WORD_NOT_FOUND, one character is emitted and treated as non-Chinese;
 *      consecutive such characters are merged into one segment (no delimiter between them);
 *    - any other status aborts the segmentation and is returned to the caller.
 *
 *  The output therefore holds wStrLen characters plus one ET9CP_SEGMENT_DELIMITER between
 *  each pair of adjacent segments.
 *
 * @param pET9CPLingInfo        pointer to chinese information structure.
 * @param pUnicodeStr           The Unicode string to be segmented.
 * @param wStrLen               The number of Unicode characters in pUnicodeStr. Must be at least 1. Maximum allowed size is defined as ET9_CP_MAX_SEGMENT_INPUT_LENGTH.
 * @param pSegmentedStr         (out) Buffer to hold the segmented Unicode string. Segments are separated by ET9CP_SEGMENT_DELIMITER.
 *                              The buffer may have been partially written when an error is returned after segmentation started.
 * @param pwSegmentedStrLen     IN: hold the size of pSegmentedStr (in characters),  OUT: hold the character count of the segmented Unicode string.
 *                              Only updated when ET9STATUS_NONE is returned.
 *
 * @return ET9STATUS_NONE               Succeeded
 * @return ET9STATUS_NO_INIT            pET9CPLingInfo is not initialized
 * @return ET9STATUS_BAD_PARAM          some pointer argument is NULL or wStrLen is 0 or wStrLen exceeds ET9_CP_MAX_SEGMENT_INPUT_LENGTH
 *                                      or buffer size is not big enough to hold the segmented Unicode string.
 * @return other                        Any other failure status reported by the LDB phrase lookup is passed through unchanged.
 */
ET9STATUS ET9FARCALL ET9CPSegmentPhrase(ET9CPLingInfo          *pET9CPLingInfo,
                                        const ET9SYMB          *pUnicodeStr,
                                        const ET9U16            wStrLen,
                                        ET9SYMB                *pSegmentedStr,
                                        ET9U16                 *pwSegmentedStrLen)
{
    ET9STATUS eStatus = ET9STATUS_ERROR;
    ET9UINT m;
    ET9UINT n = 0;                          /* Current index into the input string */
    ET9U16 wSegmentCount = 0;               /* Number of segments written so far */
    ET9U16 wCurrentSegmentLen;              /* Chunk length going into the lookup, segment length coming out */
    ET9INT asnFreqs[ET9_CP_MAX_LDB_PHRASE_SIZE];
    ET9U16 wSegmentedStrSize;               /* Capacity of pSegmentedStr supplied by the caller */
    ET9BOOL bJustFoundInvalidSymb = 0;      /* Previous emitted character was a lookup miss (non-Chinese) */
    ET9U16 wOutputSegmentedStrLen = 0;      /* Characters written to pSegmentedStr so far */

    ET9_CP_CHECK_LINGINFO(pET9CPLingInfo);

    if (pUnicodeStr == NULL || wStrLen < 1) {
        return ET9STATUS_BAD_PARAM;
    }

    /* The output can never be shorter than the input, so reject obviously undersized buffers up front.
       Room for the delimiters is verified segment by segment below. */
    if (pSegmentedStr == NULL || pwSegmentedStrLen == NULL || *pwSegmentedStrLen < wStrLen || wStrLen > ET9_CP_MAX_SEGMENT_INPUT_LENGTH) {
        return ET9STATUS_BAD_PARAM;
    }

    wSegmentedStrSize = *pwSegmentedStrLen;

    wCurrentSegmentLen = wStrLen > ET9_CP_MAX_SEGMENT_LENGTH ? ET9_CP_MAX_SEGMENT_LENGTH : wStrLen;

    /* Walk input string from beginning a chunk at a time. Chunk size is at most the maximum segment length.
       Every pass either emits a segment of at least one character or returns, so the loop always advances. */
    while (wCurrentSegmentLen > 0 && n + wCurrentSegmentLen <= wStrLen)
    {
        ET9BOOL bSkipSegmentDelimiter = 0;
        ET9BOOL fNeedDelimiter;

        eStatus = ET9_CP_GetSubLdbPhraseFreqs(pET9CPLingInfo, pUnicodeStr + n, (ET9U8) wCurrentSegmentLen, asnFreqs, ET9_CP_MAX_LDB_PHRASE_SIZE);

        if (eStatus == ET9STATUS_NONE)
        {
            /* Found LDB phrase data for this chunk */
            ET9U16 wPhraseLen = 1; /* Fall back to a single character */

            bJustFoundInvalidSymb = 0;

            /* Find shortest multi-char phrase in current chunk: a non-negative entry at index m
               is taken as "the first m + 1 characters form a phrase". */
            for (m = 1; m < wCurrentSegmentLen; m++) {
                if (asnFreqs[m] >= 0) {
                    wPhraseLen = (ET9U16) (m + 1);
                    break;
                }
            }
            wCurrentSegmentLen = wPhraseLen;
        }
        else if (eStatus == ET9STATUS_WORD_NOT_FOUND)
        {
            /* Non-Chinese characters: Group consecutive non-Chinese characters together */
            wCurrentSegmentLen = 1;

            /* Found non-Chinese character AGAIN:
               Skip segment delimiter to form consecutive non-Chinese characters segment */
            if (bJustFoundInvalidSymb) {
                bSkipSegmentDelimiter = 1;
            }
            bJustFoundInvalidSymb = 1;
            eStatus = ET9STATUS_NONE; /* Non-Chinese characters in a segment still equal success */
        }
        else
        {
            /* Error */
            return eStatus;
        }

        /* A delimiter goes in front of every new segment except the very first one */
        fNeedDelimiter = (!bSkipSegmentDelimiter && wSegmentCount > 0) ? 1 : 0;

        if (wCurrentSegmentLen + wOutputSegmentedStrLen + (fNeedDelimiter ? 1 : 0) > wSegmentedStrSize) {
            return ET9STATUS_BAD_PARAM;
        }

        /* Add the segment delimiter for previous segment */
        if (fNeedDelimiter) {
            ET9Assert(n > 0);
            pSegmentedStr[wOutputSegmentedStrLen] = ET9CP_SEGMENT_DELIMITER;
            wOutputSegmentedStrLen += 1;
        }

        for (m = 0; m < wCurrentSegmentLen; m++) {
            pSegmentedStr[wOutputSegmentedStrLen] = pUnicodeStr[n + m];
            wOutputSegmentedStrLen += 1;
        }

        /* Increment segment count only when a new segment was started
           (a character appended to a non-Chinese run extends the existing segment) */
        if (!bSkipSegmentDelimiter) {
            wSegmentCount++;
        }

        /* Advance past the emitted segment and size the next chunk from what is left */
        n += wCurrentSegmentLen;
        wCurrentSegmentLen = (wStrLen - n) < ET9_CP_MAX_SEGMENT_LENGTH ? (ET9U8) (wStrLen - n) : ET9_CP_MAX_SEGMENT_LENGTH;
    } /* Loop input string */

    ET9Assert(wSegmentCount > 0);
    ET9Assert(n == wStrLen);
    ET9Assert(wOutputSegmentedStrLen == wStrLen + wSegmentCount - 1);

    if (ET9STATUS_NONE == eStatus) {
        *pwSegmentedStrLen = wOutputSegmentedStrLen;
    }

    return eStatus;
}