StringEscaper.java
/*
* Copyright © 2014 - 2021 Leipzig University (Database Research Group)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gradoop.flink.io.impl.csv.functions;
import com.google.common.collect.BiMap;
import com.google.common.collect.ImmutableBiMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
* Escapes characters in strings and allows to split escaped strings.
*/
public class StringEscaper {
/**
* Escape character.
*/
private static final char ESCAPE_CHARACTER = '\\';
/**
* Custom escape sequences to avoid disruptive behavior of the file reader (e.g. newline).
*/
private static final BiMap<Character, CharSequence> CUSTOM_ESCAPE_SEQUENCES =
new ImmutableBiMap.Builder<Character, CharSequence>()
.put('\t', String.format("%c%c", ESCAPE_CHARACTER, 't'))
.put('\b', String.format("%c%c", ESCAPE_CHARACTER, 'b'))
.put('\n', String.format("%c%c", ESCAPE_CHARACTER, 'n'))
.put('\r', String.format("%c%c", ESCAPE_CHARACTER, 'r'))
.put('\f', String.format("%c%c", ESCAPE_CHARACTER, 'f'))
.build();
/**
* Escapes the {@code escapedCharacters} in a string.
*
* @param string string to be escaped
* @param escapedCharacters characters to be escaped
* @return escaped string
*/
public static String escape(String string, Set<Character> escapedCharacters) {
StringBuilder sb = new StringBuilder();
for (char c : string.toCharArray()) {
if (escapedCharacters.contains(c)) {
sb.append(escapeCharacter(c));
} else {
sb.append(c);
}
}
return sb.toString();
}
/**
* Unescapes the escaped characters in a string.
*
* @param escapedString string to be unescaped
* @return unescaped string
*/
public static String unescape(String escapedString) {
StringBuilder sb = new StringBuilder();
boolean escaped = false;
for (int i = 0; i < escapedString.length(); i++) {
if (escaped) {
escaped = false;
sb.append(unescapeSequence(escapedString.subSequence(i - 1, i + 1)));
} else if (escapedString.charAt(i) == ESCAPE_CHARACTER) {
escaped = true;
} else {
sb.append(escapedString.charAt(i));
}
}
return sb.toString();
}
/**
* Splits an escaped string while ignoring escaped delimiters. Does not unescape the tokens.
*
* @param escapedString escaped string to be split
* @param delimiter delimiter string
* @return string array with still escaped strings split by the delimiter
* @throws IllegalArgumentException if the delimiter contains the escape character
*/
public static String[] split(String escapedString, String delimiter)
throws IllegalArgumentException {
return split(escapedString, delimiter, 0);
}
/**
* Splits an escaped string while ignoring escaped delimiters. Does not unescape the tokens.
*
* @param escapedString escaped string to be split
* @param delimiter delimiter string
* @param limit limits the size of the output
* @return string array with still escaped strings split by the delimiter
* @throws IllegalArgumentException if the delimiter contains the escape character
*/
public static String[] split(String escapedString, String delimiter, int limit)
throws IllegalArgumentException {
if (delimiter.contains(Character.toString(ESCAPE_CHARACTER))) {
throw new IllegalArgumentException(String.format(
"Delimiter must not contain the escape character: '%c'", ESCAPE_CHARACTER));
}
if (limit <= 0) {
limit = escapedString.length() + 1;
}
List<String> tokens = new ArrayList<>();
StringBuilder sb = new StringBuilder();
boolean escaped = false;
int delimiterIndex = 0;
for (char c : escapedString.toCharArray()) {
// parse and match delimiter
if (!escaped && c == delimiter.charAt(delimiterIndex)) {
delimiterIndex++;
if (delimiterIndex == delimiter.length()) {
if (tokens.size() < limit - 1) {
tokens.add(sb.toString());
sb.setLength(0);
} else {
sb.append(delimiter, 0, delimiterIndex);
}
delimiterIndex = 0;
}
} else {
// reset delimiter parsing
sb.append(delimiter, 0, delimiterIndex);
delimiterIndex = 0;
// escape
if (escaped) {
escaped = false;
} else if (c == ESCAPE_CHARACTER) {
escaped = true;
}
sb.append(c);
}
}
sb.append(delimiter, 0, delimiterIndex);
tokens.add(sb.toString());
return tokens.toArray(new String[0]);
}
/**
* Returns the escape sequence of a given character.
*
* @param character character to be escaped
* @return escape sequence
*/
private static CharSequence escapeCharacter(char character) {
if (CUSTOM_ESCAPE_SEQUENCES.containsKey(character)) {
return CUSTOM_ESCAPE_SEQUENCES.get(character);
}
return String.format("%c%c", ESCAPE_CHARACTER, character);
}
/**
* Returns the character of a given escape sequence.
*
* @param sequence escape sequence
* @return escaped character
*/
private static char unescapeSequence(CharSequence sequence) {
if (CUSTOM_ESCAPE_SEQUENCES.containsValue(sequence)) {
return CUSTOM_ESCAPE_SEQUENCES.inverse().get(sequence);
}
return sequence.charAt(1);
}
}