zl90 / Cornell-CS4120

I'm taking Cornell University's "Introduction to Compilers" course
3 stars 0 forks source link

Handle unicode escapes within string and character literals #7

Closed zl90 closed 10 months ago

zl90 commented 10 months ago

For example, the string literal "Hello, Worl\x{64}!\n" contains a Unicode escape that must be replaced with the character it represents, so it should be printed like so: "Hello, World!\n".

Relevant note from the assignment spec:

"String and character literals should support some reasonable set of character escapes, including at least “\”, “\n”, and “\’”. In addition, an escape of the form “\x{HHHHHH}”, where HHHHHH stands for 1–6 hexadecimal digits (upper or lower case), represents the Unicode character with the corresponding code. For example “\x{0a}” is the same as “\n”."

zl90 commented 10 months ago

Had to get ChatGPT to create a custom C function to handle this:

char *handle_unicode_escape_sequences(char *str)
{
    char *result = (char *)malloc(strlen(str) + 1);
    if (!result)
    {
        fprintf(stderr, "Memory allocation failed.\n");
        return "";
    }

    int result_index = 0;
    int i = 0;
    int str_len = strlen(str);

    while (i < str_len)
    {
        if (str[i] == '\\' && str[i + 1] == 'x' && str[i + 2] == '{')
        {
            // Found a potential Unicode escape sequence \x{XXXXXX}
            i += 3; // Move past \x{
            char unicode_hex[7];
            int hex_index = 0;

            // Collect the hexadecimal characters
            while (i < str_len && str[i] != '}' && hex_index < 6)
            {
                unicode_hex[hex_index++] = str[i++];
            }
            unicode_hex[hex_index] = '\0';

            if (i < str_len && str[i] == '}')
            {
                // Valid Unicode escape sequence
                char *end;
                long unicode_value = strtol(unicode_hex, &end, 16);

                if (*end == '\0')
                {
                    // Valid hexadecimal value
                    result[result_index++] = (char)unicode_value;
                    i++; // Move past }
                }
                else
                {
                    // Not a valid hexadecimal value, copy as is
                    result[result_index++] = '\\';
                    result[result_index++] = 'x';
                    for (int j = 0; j < hex_index; j++)
                    {
                        result[result_index++] = unicode_hex[j];
                    }
                }
            }
            else
            {
                // Missing closing curly brace, treat as regular characters
                result[result_index++] = '\\';
                result[result_index++] = 'x';
                for (int j = 0; j < hex_index; j++)
                {
                    result[result_index++] = unicode_hex[j];
                }
            }
        }
        else
        {
            // Copy characters normally
            result[result_index++] = str[i++];
        }
    }

    result[result_index] = '\0';

    return result;
}