Commit 9aaa754d authored by Jesper Zedlitz

Verbesserte Behandlung von GEDCOM-Dateien mit Fehlern:

 - fehlende BOM bei UTF-16 Dateien
 - übersprungene Level
 - ungültige Zeilen
parent bf8bc37a
Pipeline #1368 passed with stage
in 43 seconds
......@@ -26,6 +26,7 @@ import java.io.*;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
/**
* @author Jesper Zedlitz <j.zedlitz@email.uni-kiel.de>
......@@ -34,6 +35,10 @@ public class GedcomReader {
private static CharsetDecoder anselDecoder = new AnselCharset().newDecoder();
/**
 * Reads a GEDCOM file in non-strict mode by delegating to
 * {@link #read(File, boolean)} with {@code strict} set to {@code false}.
 *
 * @param file the GEDCOM file to parse
 * @return whatever the two-argument overload returns for this file
 * @throws IOException propagated from the two-argument overload
 */
public static Node read(File file) throws IOException {
    final boolean strict = false;
    return read(file, strict);
}
public static Node read(File file, boolean strict) throws IOException {
String characterEncoding = "latin1";
boolean charsetFound = false;
boolean isAnsel = false;
......@@ -45,8 +50,16 @@ public class GedcomReader {
fis.close();
BufferedReader gedcomFile;
if (((bom[0] & 0xff) == 0xff && (bom[1] & 0xff) == 0xfe) || ((bom[0] & 0xff) == 0xfe && (bom[1] & 0xff) == 0xff)) {
gedcomFile = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-16"));
if (((bom[0] & 0xff) == 0xfe && (bom[1] & 0xff) == 0xff) // the official UTF-16 BE BOM
|| ((bom[0] & 0xff) == 0 && (bom[1] & 0xff) == '0') // some UTF-16 GEDCOM files come without the BOM
) {
gedcomFile = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_16BE));
characterEncoding = "UTF-16";
charsetFound = true;
} else if (((bom[0] & 0xff) == 0xff && (bom[1] & 0xff) == 0xfe) // the official UTF-16 LE BOM
|| ((bom[0] & 0xff) == '0' && (bom[1] & 0xff) == 0) // some UTF-16 GEDCOM files come without the BOM
) {
gedcomFile = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_16LE));
characterEncoding = "UTF-16";
charsetFound = true;
} else {
......@@ -95,6 +108,7 @@ public class GedcomReader {
while (line != null) {
lineNumber++;
line = StringUtils.trim(line);
boolean skipLine = false;
int level = NumberUtils.toInt(StringUtils.substringBefore(line, " "));
String tag;
......@@ -122,7 +136,11 @@ public class GedcomReader {
} else {
// Level has to be greater zero.
if (level == 0) {
throw new GedcomException("Invalid GEDCOM line", lineNumber, line );
if (strict) {
throw new GedcomException("Invalid GEDCOM line", lineNumber, line);
} else {
skipLine = true;
}
}
tag = StringUtils.trim(StringUtils.substringBefore(StringUtils.substringAfter(line, " "), " "));
......@@ -137,43 +155,53 @@ public class GedcomReader {
}
if (tag.contains(" ")) {
throw new GedcomException("Invalid GEDCOM line", lineNumber, line );
if (strict) {
throw new GedcomException("Invalid GEDCOM line", lineNumber, line);
} else {
skipLine = true;
}
}
// ========== A single line has been read. Now construct the tree. ==========
if (!skipLine) {
// The two special cases CONT and CONC that do not need new nodes.
if ("CONT".equals(tag)) {
currentNode.setValue(currentNode.getValue() + "\n" + value);
} else if ("CONC".equals(tag)) {
currentNode.setValue(currentNode.getValue() + value);
} else {
Node node = new Node(level, WordUtils.capitalize(tag));
int lastLevel = currentNode.getLevel();
if (level > lastLevel + 1) {
throw new GedcomException("Skipped level", lineNumber, line);
}
if (level > lastLevel) {
// a child node
currentNode.addChild(node);
} else if (level < lastLevel) {
// climb the tree up
for (int i = level; i <= lastLevel; i++) {
currentNode = currentNode.getParent();
}
currentNode.addChild(node);
// The two special cases CONT and CONC that do not need new nodes.
if ("CONT".equals(tag)) {
currentNode.setValue(currentNode.getValue() + "\n" + value);
} else if ("CONC".equals(tag)) {
currentNode.setValue(currentNode.getValue() + value);
} else {
// a sibling node
currentNode.getParent().addChild(node);
}
int lastLevel = currentNode.getLevel();
if (level > lastLevel + 1) {
if (strict) {
throw new GedcomException("Skipped level", lineNumber, line);
} else {
level = lastLevel + 1;
}
}
Node node = new Node(level, WordUtils.capitalize(tag));
if (level > lastLevel) {
// a child node
currentNode.addChild(node);
} else if (level < lastLevel) {
// climb the tree up
for (int i = level; i <= lastLevel; i++) {
currentNode = currentNode.getParent();
}
currentNode.addChild(node);
} else {
// a sibling node
currentNode.getParent().addChild(node);
}
node.setXref(xref);
node.setPointer(pointer);
node.setValue(value);
node.setXref(xref);
node.setPointer(pointer);
node.setValue(value);
currentNode = node;
currentNode = node;
}
}
line = gedcomFile.readLine();
......
......@@ -20,6 +20,7 @@ package de.zedlitz.gedcom;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* @author Jesper Zedlitz <j.zedlitz@email.uni-kiel.de>
......@@ -136,6 +137,28 @@ public class Node {
return children;
}
/**
 * Collects every child node whose tag equals the given tag, ignoring case.
 *
 * @param tag the tag to look for
 * @return a new list with the matching children in their stored order;
 *         empty if no child matches
 */
public List<Node> getChildren(String tag) {
    final List<Node> matches = new ArrayList<>();
    for (final Node child : children) {
        if (tag.equalsIgnoreCase(child.tag)) {
            matches.add(child);
        }
    }
    return matches;
}
/**
 * Finds the first child node whose tag equals the given tag, ignoring case.
 *
 * @param tag the tag to look for
 * @return the first matching child, or {@code null} if no child matches
 */
public Node getFirstChild(String tag) {
    for (final Node child : children) {
        if (tag.equalsIgnoreCase(child.tag)) {
            return child;
        }
    }
    return null;
}
/**
 * Returns the value of the first child node with the specified tag.
 *
 * @param tag the tag to look for
 * @return the value of the first matching child, or {@code null} if no such child exists
 */
public String getFirstChildValue(String tag) {
    final Node firstMatch = getFirstChild(tag);
    if (firstMatch == null) {
        return null;
    }
    return firstMatch.getValue();
}
public void addChild(Node child) {
this.children.add(child);
child.setParent(this);
......
......@@ -119,14 +119,27 @@ public class GedcomReaderTest {
public void testRead_skippedLevel() throws IOException {
    // Default (non-strict) parsing: the test passes as long as no exception escapes.
    final String path = GedcomReaderTest.class.getResource("/err-skip.ged").getFile();
    GedcomReader.read(new File(path));
}
/**
 * This GEDCOM file contains an invalid skipped level: a level 3 tag is child of a level 1 tag.
 * Parsing the file with <code>strict=true</code> must raise an exception.
 */
@Test
public void testRead_skippedLevel_strict() throws IOException {
    final File gedcomFile = new File(GedcomReaderTest.class.getResource("/err-skip.ged").getFile());
    try {
        // Only the strict parse may run inside the try block. A preceding lenient
        // parse that threw would otherwise satisfy this test for the wrong reason.
        GedcomReader.read(gedcomFile, true);
        fail("strict parsing must reject a skipped level");
    } catch (GedcomException e) {
        assertTrue(e.getMessage().contains("Skipped"));
    }
}
/**
* This GEDCOM file contains an invalid line.
*/
......@@ -134,18 +147,38 @@ public class GedcomReaderTest {
public void testRead_invalidLine() throws IOException {
    // Default (non-strict) parsing: the test passes as long as no exception escapes.
    final String path = GedcomReaderTest.class.getResource("/err-invalid-line.ged").getFile();
    GedcomReader.read(new File(path));
}
/**
 * This GEDCOM file contains an invalid line.
 * Parsing the file with <code>strict=true</code> must raise an exception.
 */
@Test
public void testRead_invalidLine_strict() throws IOException {
    final File gedcomFile = new File(GedcomReaderTest.class.getResource("/err-invalid-line.ged").getFile());
    try {
        // Only the strict parse may run inside the try block, so the expected
        // exception can only come from strict mode.
        GedcomReader.read(gedcomFile, true);
        fail("strict parsing must reject an invalid line");
    } catch (GedcomException expected) {
        // Expected: strict mode rejects the invalid line. Nothing further to verify.
    }
}
@Test
public void testRead_level0NoXref() throws IOException {
    // Parsing only has to complete without throwing; the result is not inspected.
    final String path = GedcomReaderTest.class.getResource("/level0.ged").getFile();
    GedcomReader.read(new File(path));
}
@Test
public void testRead_utf16le() throws IOException {
    final String path = GedcomReaderTest.class.getResource("/utf16le.ged").getFile();
    final Node root = GedcomReader.read(new File(path));

    // The reader must return a node and that node must expose a child collection.
    assertNotNull(root);
    assertNotNull(root.getChildren());
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment