Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,20 @@ public final void parseVersion(String version) {
int startIndex = 0;

for (int i = 0; i < version.length(); i++) {
char c = version.charAt(i);
char character = version.charAt(i);
int c = character;
if (Character.isHighSurrogate(character)) {
// read the next character as a low surrogate and combine into a single int
try {
char low = version.charAt(i + 1);
char[] both = {character, low};
c = Character.codePointAt(both, 0);
i++;
} catch (IndexOutOfBoundsException ex) {
// high surrogate without low surrogate. Not a lot we can do here except treat it as a regular
// character
}
}

if (c == '.') {
if (i == startIndex) {
Expand Down Expand Up @@ -687,7 +700,8 @@ public final void parseVersion(String version) {
stack.push(list);
}
isCombination = false;
} else if (Character.isDigit(c)) {
// TODO we might not want to use isDigit here; just check for ASCII digits only
} else if (c >= '0' && c <= '9') {
Copy link
Contributor

@gnodet gnodet Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why not. It seems to me that later, the string will be parsed using Integer.parseInt, Long.parseLong, or new BigInteger(), and AFAIK, they all leverage the Character.digit(char, int) which should support everything that is considered a digit by Character.isDigit(char) (but not Character.isDigit(int)).
I think this has the same effect as we're only considering radix-10 when parsing.
An alternative to support supplemental characters would be to normalise those in a buffer while they are read in this loop and pass on the buffer rather than a substring.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like the following maybe ?

@SuppressWarnings("checkstyle:innerassignment")
public final void parseVersion(String version) {
    this.value = version;

    items = new ListItem();

    version = version.toLowerCase(Locale.ENGLISH);

    ListItem list = items;

    Deque<Item> stack = new ArrayDeque<>();
    stack.push(list);

    boolean isDigit = false;

    boolean isCombination = false;

    int startIndex = 0;

    StringBuilder normalizedBuffer = new StringBuilder();

    for (int i = 0; i < version.length(); i++) {
        char character = version.charAt(i);
        int c = character;

        // Handle high surrogate
        if (Character.isHighSurrogate(character)) {
            try {
                char low = version.charAt(i + 1);
                char[] both = {character, low};
                c = Character.codePointAt(both, 0);
                i++;
            } catch (IndexOutOfBoundsException ex) {
                // Treat high surrogate without low surrogate as a regular character
            }
        }

        // Normalize the Unicode digits (including supplemental plane characters) using Character.digit()
        if (Character.isDigit(c)) {
            // Normalize Unicode digits to ASCII digits
            int normalizedDigit = Character.digit(c, 10);
            if (normalizedDigit != -1) {
                c = normalizedDigit + '0'; // Convert digit to its ASCII equivalent
            }
        }

        // Build the normalized substring on the fly
        if (c == '.' || c == '-' || i == version.length() - 1) {
            // If we've reached a separator or the end of the string, process the current segment
            if (i == startIndex && c != '-') {
                list.add(IntItem.ZERO);  // Handle empty sections
            } else {
                // Add the current segment to the list after normalizing it
                list.add(parseItem(isCombination, isDigit, normalizedBuffer.toString()));
            }
            isCombination = false;
            normalizedBuffer.setLength(0);  // Reset the buffer for the next segment
            startIndex = i + 1;

            // Handle separators
            if (c == '-' && !list.isEmpty()) {
                list.add(list = new ListItem());
                stack.push(list);
            }
        } else if (c >= '0' && c <= '9') {
            if (!isDigit && i > startIndex) {
                // Handle combination of a letter and number, like "X1"
                isCombination = true;
                if (!list.isEmpty()) {
                    list.add(list = new ListItem());
                    stack.push(list);
                }
            }
            isDigit = true;
        } else {
            if (isDigit && i > startIndex) {
                list.add(parseItem(isCombination, true, normalizedBuffer.toString()));
                normalizedBuffer.setLength(0);  // Reset buffer
                startIndex = i;

                list.add(list = new ListItem());
                stack.push(list);
                isCombination = false;
            }

            isDigit = false;
        }

        // Append the current character to the normalized buffer
        normalizedBuffer.append((char) c);
    }

    if (version.length() > startIndex) {
        // Final segment (if any)
        if (!isDigit && !list.isEmpty()) {
            list.add(list = new ListItem());
            stack.push(list);
        }

        list.add(parseItem(isCombination, isDigit, normalizedBuffer.toString()));
    }

    while (!stack.isEmpty()) {
        list = (ListItem) stack.pop();
        list.normalize();
    }
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, just to be clear you prefer the alternate approach of treating all Unicode digits as version numbers? As long as they're all base 10 this should work. Looking at Unicode details, there are some non-base 10 digits. They have a different Unicode character class, so I need to check what Character.isDigit is doing; i..e whether it's checking for decimal digits or all digits. If it is checking for all digits, I need to check the character class directly. To be specific we want De, possibly Di, but not Nu character classes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, looks like Character.isDigit is checking specifically for decimal digits, not non-decimal digits

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ran into a problem with that approach. It treats different versions of digits as the same. E.g. 7 is the same as Arabic-Indic digit 7 even though they are different characters.

I can see an argument for normalizing all these digits to ASCII digits for Maven purposes, but that would touch a lot of different places in the code and ecosystem including things like file names and artifact URLs in Maven Central.
I'm going to think about this a little more, but for the moment it feels safer to treat all the non-ASCII digits as distinct non-numeric characters.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only supporting ASCII digits sounds fine to me too.

if (!isDigit && i > startIndex) {
// X1
isCombination = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,33 @@ void testLeadingZeroes() {
checkVersionsOrder("0.2", "1.0.7");
}

@Test
void testDigitGreaterThanNonAscii() {
ComparableVersion c1 = new ComparableVersion("1");
ComparableVersion c2 = new ComparableVersion("é");
assertTrue(c1.compareTo(c2) > 0, "expected " + "1" + " > " + "\uD835\uDFE4");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The messages is wrong

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

assertTrue(c2.compareTo(c1) < 0, "expected " + "\uD835\uDFE4" + " < " + "1");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and that one too

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

}

@Test
void testDigitGreaterThanNonBmpCharacters() {
ComparableVersion c1 = new ComparableVersion("1");
// MATHEMATICAL SANS-SERIF DIGIT TWO
ComparableVersion c2 = new ComparableVersion("\uD835\uDFE4");
assertTrue(c1.compareTo(c2) > 0, "expected " + "1" + " > " + "\uD835\uDFE4");
assertTrue(c2.compareTo(c1) < 0, "expected " + "\uD835\uDFE4" + " < " + "1");
}

@Test
void testNonAsciiDigits() {
// These should be treated the same as non-digit characters
ComparableVersion c1 = new ComparableVersion("1");
// ArabicIndicNine
ComparableVersion c2 = new ComparableVersion("\u0669");
assertTrue(c1.compareTo(c2) > 0, "expected " + "1" + " > " + "\u0669");
assertTrue(c2.compareTo(c1) < 0, "expected " + "\u0669" + " < " + "1");
}

@Test
void testGetCanonical() {
// MNG-7700
Expand Down