Browse Source

feat: Large scale import/export functionality

Matthias Ladkau 2 years ago
parent
commit
3a2e92a4eb
4 changed files with 336 additions and 4 deletions
  1. 73 0
      graph/graphmanager_test.go
  2. 170 4
      graph/import_export.go
  3. 53 0
      graph/import_export_test.go
  4. 40 0
      importexport.md

+ 73 - 0
graph/graphmanager_test.go

@@ -17,6 +17,7 @@ import (
 	"testing"
 
 	"devt.de/krotik/common/fileutil"
+	"devt.de/krotik/eliasdb/graph/data"
 	"devt.de/krotik/eliasdb/graph/graphstorage"
 )
 
@@ -75,3 +76,75 @@ NewGraphManager returns a new GraphManager instance without loading rules.
 func newGraphManagerNoRules(gs graphstorage.Storage) *Manager {
 	return createGraphManager(gs)
 }
+
+/*
+Create a GraphManager which has some prefilled data.
+*/
+func songGraph() (*Manager, *graphstorage.MemoryGraphStorage) {
+
+	mgs := graphstorage.NewMemoryGraphStorage("mystorage")
+	gm := NewGraphManager(mgs)
+
+	constructEdge := func(key string, node1 data.Node, node2 data.Node, number int) data.Edge {
+		edge := data.NewGraphEdge()
+
+		edge.SetAttr("key", key)
+		edge.SetAttr("kind", "Wrote")
+
+		edge.SetAttr(data.EdgeEnd1Key, node1.Key())
+		edge.SetAttr(data.EdgeEnd1Kind, node1.Kind())
+		edge.SetAttr(data.EdgeEnd1Role, "Author")
+		edge.SetAttr(data.EdgeEnd1Cascading, true)
+
+		edge.SetAttr(data.EdgeEnd2Key, node2.Key())
+		edge.SetAttr(data.EdgeEnd2Kind, node2.Kind())
+		edge.SetAttr(data.EdgeEnd2Role, "Song")
+		edge.SetAttr(data.EdgeEnd2Cascading, false)
+
+		edge.SetAttr("number", number)
+
+		return edge
+	}
+
+	storeSong := func(node data.Node, name string, ranking int, number int) {
+		node3 := data.NewGraphNode()
+		node3.SetAttr("key", name)
+		node3.SetAttr("kind", "Song")
+		node3.SetAttr("name", name)
+		node3.SetAttr("ranking", ranking)
+		gm.StoreNode("main", node3)
+		gm.StoreEdge("main", constructEdge(name+"-edge", node, node3, number))
+	}
+
+	node0 := data.NewGraphNode()
+	node0.SetAttr("key", "000")
+	node0.SetAttr("kind", "Author")
+	node0.SetAttr("name", "John")
+	gm.StoreNode("main", node0)
+
+	storeSong(node0, "Aria1", 8, 1)
+	storeSong(node0, "Aria2", 2, 2)
+	storeSong(node0, "Aria3", 4, 3)
+	storeSong(node0, "Aria4", 18, 4)
+
+	node1 := data.NewGraphNode()
+	node1.SetAttr("key", "123")
+	node1.SetAttr("kind", "Author")
+	node1.SetAttr("name", "Mike")
+	gm.StoreNode("main", node1)
+
+	storeSong(node1, "LoveSong3", 1, 3)
+	storeSong(node1, "FightSong4", 3, 4)
+	storeSong(node1, "DeadSong2", 6, 2)
+	storeSong(node1, "StrangeSong1", 5, 1)
+
+	node2 := data.NewGraphNode()
+	node2.SetAttr("key", "456")
+	node2.SetAttr("kind", "Author")
+	node2.SetAttr("name", "Hans")
+	gm.StoreNode("main", node2)
+
+	storeSong(node2, "MyOnlySong3", 19, 3)
+
+	return gm, mgs.(*graphstorage.MemoryGraphStorage)
+}

+ 170 - 4
graph/import_export.go

@@ -11,21 +11,24 @@
 package graph
 
 import (
+	"bufio"
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"io"
+	"strings"
 
 	"devt.de/krotik/common/errorutil"
 	"devt.de/krotik/eliasdb/graph/data"
+	"devt.de/krotik/eliasdb/hash"
 )
 
 /*
 ExportPartition dumps the contents of a partition to an io.Writer in JSON format:
 
 	{
-		nodes : [ { <attr> : <value> }, ... ]
-		edges : [ { <attr> : <value> }, ... ]
+		nodes : [ { <attr> : <value>, ... }, ... ]
+		edges : [ { <attr> : <value>, ... }, ... ]
 	}
 */
 func ExportPartition(out io.Writer, part string, gm *Manager) error {
@@ -207,8 +210,8 @@ ImportPartition imports the JSON contents of an io.Reader into a given partition
 The following format is expected:
 
 	{
-		nodes : [ { <attr> : <value> }, ... ]
-		edges : [ { <attr> : <value> }, ... ]
+		nodes : [ { <attr> : <value>, ... }, ... ]
+		edges : [ { <attr> : <value>, ... }, ... ]
 	}
 */
 func ImportPartition(in io.Reader, part string, gm *Manager) error {
@@ -251,3 +254,166 @@ func ImportPartition(in io.Reader, part string, gm *Manager) error {
 
 	return trans.Commit()
 }
+
/*
ExportWriterFactory produces a writer for a given name. It is used by
LargeScaleExportPartition to create an output destination per data stream.
*/
type ExportWriterFactory interface {
	CreateWriter(name string) (io.Writer, error)
}
+
+/*
+LargeScaleExportPartition dumps the contents of a partition into multiple io.Writer in line-delimited JSON format:
+
+{ <attr> : <value>, ... },
+{ <attr> : <value>, ... },
+...
+*/
+func LargeScaleExportPartition(ewf ExportWriterFactory, part string, gm *Manager) error {
+	out, err := ewf.CreateWriter(fmt.Sprintf("%v-nodes", part))
+
+	if err == nil {
+		for _, k := range gm.NodeKinds() {
+			var it *NodeKeyIterator
+
+			if it, err = gm.NodeKeyIterator(part, k); err == nil {
+
+				for it.HasNext() {
+					key := it.Next()
+
+					if err = it.Error(); err == nil {
+						var node data.Node
+
+						if node, err = gm.FetchNode(part, key, k); err == nil {
+							var jsonBytes []byte
+
+							if jsonBytes, err = json.Marshal(node.Data()); err == nil {
+								_, err = out.Write(jsonBytes)
+								fmt.Fprintln(out)
+							}
+						}
+					}
+
+					if err != nil {
+						break
+					}
+				}
+			}
+
+			if err != nil {
+				break
+			}
+		}
+	}
+
+	out, err = ewf.CreateWriter(fmt.Sprintf("%v-edges", part))
+
+	if err == nil {
+		for _, k := range gm.EdgeKinds() {
+			var tree *hash.HTree
+
+			if tree, err = gm.getEdgeStorageHTree(part, k, false); err == nil {
+				gm.mutex.RLock()
+				it := hash.NewHTreeIterator(tree)
+				gm.mutex.RUnlock()
+
+				if err = it.LastError; err == nil {
+
+					for it.HasNext() {
+						gm.mutex.RLock()
+						binaryKey, _ := it.Next()
+						gm.mutex.RUnlock()
+
+						if prefix := binaryKey[:len(PrefixNSAttrs)]; string(prefix) != PrefixNSAttrs {
+							continue
+						}
+
+						key := string(binaryKey[len(PrefixNSAttrs):])
+
+						if err = it.LastError; err == nil {
+							var node data.Node
+
+							if node, err = gm.FetchEdge(part, key, k); err == nil {
+								var jsonBytes []byte
+
+								if jsonBytes, err = json.Marshal(node.Data()); err == nil {
+									_, err = out.Write(jsonBytes)
+									fmt.Fprintln(out)
+								}
+							}
+						}
+
+						if err != nil {
+							break
+						}
+					}
+				}
+			}
+
+			if err != nil {
+				break
+			}
+
+		}
+	}
+
+	return err
+}
+
/*
ImportReaderFactory produces named readers. Readers lists all available
names; CreateReader returns the reader for one of them. It is used by
LargeScaleImportPartition to consume multiple input streams.
*/
type ImportReaderFactory interface {
	Readers() []string
	CreateReader(name string) (io.Reader, error)
}
+
+/*
+LargeScaleImportPartition dumps the contents of a partition into multiple io.Writer in line-delimited JSON format:
+
+{ <attr> : <value>, ... },
+{ <attr> : <value>, ... },
+...
+*/
+func LargeScaleImportPartition(irf ImportReaderFactory, part string, gm *Manager) error {
+	var err error
+
+	readers := irf.Readers()
+
+	trans := NewRollingTrans(NewGraphTrans(gm), 1000, gm, NewGraphTrans)
+
+	for _, r := range readers {
+		var in io.Reader
+
+		if in, err = irf.CreateReader(r); err == nil {
+			isNode := strings.HasSuffix(r, "-nodes")
+
+			scanner := bufio.NewScanner(in)
+			for scanner.Scan() {
+				var nodeData map[string]interface{}
+
+				if err = json.Unmarshal(scanner.Bytes(), &nodeData); err == nil {
+					if isNode {
+						err = gm.StoreNode(part, data.NewGraphNodeFromMap(nodeData))
+					} else {
+						err = gm.StoreEdge(part, data.NewGraphEdgeFromNode(data.NewGraphNodeFromMap(nodeData)))
+					}
+				}
+
+				if err != nil {
+					break
+				}
+			}
+		}
+
+		if err != nil {
+			break
+		}
+	}
+
+	if err == nil {
+		err = trans.Commit()
+	}
+
+	return err
+}

+ 53 - 0
graph/import_export_test.go

@@ -12,6 +12,8 @@ package graph
 
 import (
 	"bytes"
+	"fmt"
+	"io"
 	"strings"
 	"testing"
 
@@ -398,3 +400,54 @@ func TestImportExportError(t *testing.T) {
 	}
 
 }
+
+type testFactory struct {
+	readers []string
+	buf     map[string]*bytes.Buffer
+}
+
+func (tf *testFactory) CreateWriter(name string) (io.Writer, error) {
+	var b bytes.Buffer
+
+	tf.readers = append(tf.readers, name)
+	tf.buf[name] = &b
+
+	return &b, nil
+}
+
+func (tf *testFactory) Readers() []string {
+	return tf.readers
+}
+
+func (tf *testFactory) CreateReader(name string) (io.Reader, error) {
+	return tf.buf[name], nil
+}
+
+func TestScaleExport(t *testing.T) {
+	gm, _ := songGraph()
+
+	tf := &testFactory{make([]string, 0), make(map[string]*bytes.Buffer)}
+
+	LargeScaleExportPartition(tf, "main", gm)
+
+	var out1 bytes.Buffer
+
+	ExportPartition(&out1, "main", gm)
+	res1 := SortDump(out1.String())
+
+	fmt.Println(res1)
+
+	mgs2 := graphstorage.NewMemoryGraphStorage("mystorage2")
+	gm2 := NewGraphManager(mgs2)
+
+	LargeScaleImportPartition(tf, "main", gm2)
+
+	var out2 bytes.Buffer
+
+	ExportPartition(&out2, "main", gm2)
+	res2 := SortDump(out2.String())
+
+	fmt.Println(res2)
+
+	fmt.Println("-->", res1 == res2)
+}

+ 40 - 0
importexport.md

@@ -0,0 +1,40 @@
+EliasDB Import/Export
+=====================
+
+EliasDB supports importing and exporting of data in various ways:
+- By [embedding](embedding.md) EliasDB in another Go project.
+- By using the [REST API](http://petstore.swagger.io/?url=https://devt.de/krotik/eliasdb/raw/master/swagger.json) interface.
+- By running an [ECAL](http://petstore.swagger.io/?url=https://devt.de/krotik/eliasdb/raw/master/swagger.json) script.
+- By running the `EliasDB` executable with import/export parameters in the CLI.
+
+Bulk importing and exporting is best done through the last option.
+
+Bulk importing and exporting via the CLI
+--
+Bulk import/export through the CLI is available using the `eliasdb` binary with the `server` command. In general there are two different types of import/export modes:
+- Normal import/export through a single compact ZIP file.
+- Large scale import/export through multiple ZIP files.
+
+Parameter|Description
+-|-
+-export|Export the current DB into a ZIP file. The data of each partition is stored into a separate file as a JSON object.
+-import|Import into the current DB from a ZIP file. The data is expected in the same format as in the `-export` case.
+-export-ls|Export the current DB into multiple ZIP files. The data of each partition is stored into two separate files for nodes and edges in a line-delimited JSON format.
+-import-ls|Import into the current DB from multiple ZIP files. The data is expected in the same format as in the `-export-ls` case.
+
+By default the server will start after the import/export operation. This can be disabled by using the `-no-serv` parameter.
+
+Format for normal import/export
+--
+The normal import/export will work on a single ZIP file which contains a series of `.json` files. The name of each file will become a separate partition. Each of these `.json` files contains a single JSON object with the following structure:
+```
+{
+  nodes : [ { <attr> : <value>, ... }, ... ]
+  edges : [ { <attr> : <value>, ... }, ... ]
+}
+```
+When embedding EliasDB in another Go project this can be produced and consumed via `graph.ExportPartition` and `graph.ImportPartition`.
+
+Format for large scale import/export
+--
+The large scale