aboutsummaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
authorTill <2353100+S7evinK@users.noreply.github.com>2022-09-07 18:15:54 +0200
committerGitHub <noreply@github.com>2022-09-07 18:15:54 +0200
commitd5876abbe9f5484768f603ec91a567b8650e6e73 (patch)
treee8288bac7557a840ed636391ce5afbc7059dc993 /internal
parent31f4ae8997af7e939f505107341b86b2abd3fd9a (diff)
Fulltext implementation incl. config (#2480)
This adds the main component of the fulltext search. This PR doesn't do anything yet, besides creating an empty fulltextindex folder if enabled. Indexing events is done in a separate PR.
Diffstat (limited to 'internal')
-rw-r--r--internal/fulltext/bleve.go164
-rw-r--r--internal/fulltext/bleve_test.go250
-rw-r--r--internal/fulltext/bleve_wasm.go65
3 files changed, 479 insertions, 0 deletions
diff --git a/internal/fulltext/bleve.go b/internal/fulltext/bleve.go
new file mode 100644
index 00000000..b07c0e51
--- /dev/null
+++ b/internal/fulltext/bleve.go
@@ -0,0 +1,164 @@
+// Copyright 2022 The Matrix.org Foundation C.I.C.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !wasm
+// +build !wasm
+
+package fulltext
+
+import (
+ "strings"
+
+ "github.com/blevesearch/bleve/v2"
+ "github.com/blevesearch/bleve/v2/mapping"
+ "github.com/matrix-org/dendrite/setup/config"
+ "github.com/matrix-org/gomatrixserverlib"
+)
+
+// Search wraps the bleve index used for fulltext search. Its methods operate
+// on FulltextIndex.
+type Search struct {
+ // FulltextIndex is the underlying bleve index; New fills it via openIndex.
+ FulltextIndex bleve.Index
+}
+
+// IndexElement describes the layout of a single document stored in the
+// fulltext index.
+type IndexElement struct {
+ // EventID is used as the document ID when indexing and deleting.
+ EventID string
+ // RoomID is mapped as a keyword so searches can be limited to rooms.
+ RoomID string
+ // Content is the text that is analyzed and searched.
+ Content string
+ // ContentType names the kind of content, e.g. "content.body"; see
+ // SetContentType.
+ ContentType string
+ // StreamPosition is mapped as a number and can be used to order results.
+ StreamPosition int64
+}
+
+// SetContentType translates a Matrix event type into the content key stored
+// in i.ContentType. Event types without a known mapping leave i.ContentType
+// untouched.
+func (i *IndexElement) SetContentType(v string) {
+	var key string
+	switch v {
+	case "m.room.message":
+		key = "content.body"
+	case gomatrixserverlib.MRoomName:
+		key = "content.name"
+	case gomatrixserverlib.MRoomTopic:
+		key = "content.topic"
+	default:
+		// unknown event type, keep whatever is currently set
+		return
+	}
+	i.ContentType = key
+}
+
+// New returns a Search backed by the index described by cfg. The index is
+// opened if it already exists and created otherwise (see openIndex). On
+// failure the returned Search is nil.
+func New(cfg config.Fulltext) (fts *Search, err error) {
+	index, err := openIndex(cfg)
+	if err != nil {
+		return nil, err
+	}
+	return &Search{FulltextIndex: index}, nil
+}
+
+// Close closes the underlying fulltext index and returns the error reported
+// by the index, if any.
+func (f *Search) Close() error {
+ return f.FulltextIndex.Close()
+}
+
+// Index adds the given elements to the fulltext index in a single batch,
+// using each element's EventID as its document ID. If one element cannot be
+// added to the batch, the error is returned and the batch is not executed.
+func (f *Search) Index(elements ...IndexElement) error {
+	b := f.FulltextIndex.NewBatch()
+	for idx := range elements {
+		if err := b.Index(elements[idx].EventID, elements[idx]); err != nil {
+			return err
+		}
+	}
+	return f.FulltextIndex.Batch(b)
+}
+
+// Delete removes the element indexed under the given eventID from the
+// fulltext index.
+func (f *Search) Delete(eventID string) error {
+ return f.FulltextIndex.Delete(eventID)
+}
+
+// Search queries the index and returns the raw bleve search result.
+//
+// term is split on whitespace and every resulting word has to match the
+// Content field. roomIDs and keys are optional filters: when non-empty, a hit
+// has to match at least one of the room IDs and at least one of the content
+// types (e.g. "content.body") respectively. limit and from are passed to
+// bleve for pagination.
+//
+// Hits are ordered by relevance, best match first, unless orderByStreamPos is
+// set, in which case they are ordered by descending StreamPosition.
+func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (*bleve.SearchResult, error) {
+	qry := bleve.NewConjunctionQuery()
+
+	// strings.Fields rather than strings.Split(term, " "): repeated, leading
+	// or trailing whitespace must not produce empty words, which would
+	// otherwise be added as mandatory match queries on empty text.
+	termQuery := bleve.NewBooleanQuery()
+	for _, word := range strings.Fields(term) {
+		matchQuery := bleve.NewMatchQuery(word)
+		matchQuery.SetField("Content")
+		termQuery.AddMust(matchQuery)
+	}
+	qry.AddQuery(termQuery)
+
+	// addFilter restricts the result to documents whose field matches at
+	// least one of values; an empty values slice adds no restriction.
+	addFilter := func(field string, values []string) {
+		if len(values) == 0 {
+			return
+		}
+		filter := bleve.NewBooleanQuery()
+		for _, value := range values {
+			match := bleve.NewMatchQuery(value)
+			match.SetField(field)
+			filter.AddShould(match)
+		}
+		qry.AddQuery(filter)
+	}
+	addFilter("RoomID", roomIDs)
+	addFilter("ContentType", keys)
+
+	s := bleve.NewSearchRequestOptions(qry, limit, from, false)
+	s.Fields = []string{"*"}
+	if orderByStreamPos {
+		s.SortBy([]string{"-StreamPosition"})
+	} else {
+		// The "-" prefix requests descending order, i.e. the highest score
+		// first; a plain "_score" would list the weakest matches first.
+		s.SortBy([]string{"-_score"})
+	}
+
+	return f.FulltextIndex.Search(s)
+}
+
+// openIndex returns the bleve index described by cfg.
+//
+// If cfg.InMemory is set, a new memory-only index is returned. Otherwise the
+// index at cfg.IndexPath is opened, or created with the mapping from
+// getMapping if opening it fails. If creation fails because an index already
+// exists at that path, the error from the failed open is returned, as it is
+// the one describing the actual problem.
+func openIndex(cfg config.Fulltext) (bleve.Index, error) {
+	m := getMapping(cfg)
+	if cfg.InMemory {
+		return bleve.NewMemOnly(m)
+	}
+
+	path := string(cfg.IndexPath)
+	index, openErr := bleve.Open(path)
+	if openErr == nil {
+		return index, nil
+	}
+
+	index, err := bleve.New(path, m)
+	if err == bleve.ErrorIndexPathExists {
+		// There is an index at path, it just could not be opened. Do not
+		// hide the reason behind "path already exists".
+		return nil, openErr
+	}
+	if err != nil {
+		return nil, err
+	}
+	return index, nil
+}
+
+// getMapping builds the index mapping for IndexElement documents: Content is
+// a text field using the analyzer named by cfg.Language, StreamPosition is
+// numeric, and ContentType, RoomID and EventID are keyword fields. The
+// document mapping is registered as "Event", which is also the default type.
+func getMapping(cfg config.Fulltext) *mapping.IndexMappingImpl {
+	const docType = "Event"
+
+	docMapping := bleve.NewDocumentMapping()
+
+	contentMapping := bleve.NewTextFieldMapping()
+	contentMapping.Analyzer = cfg.Language
+	docMapping.AddFieldMappingsAt("Content", contentMapping)
+	docMapping.AddFieldMappingsAt("StreamPosition", bleve.NewNumericFieldMapping())
+
+	// Identifiers are indexed as is; all three share one keyword mapping.
+	keywordMapping := bleve.NewKeywordFieldMapping()
+	for _, field := range []string{"ContentType", "RoomID", "EventID"} {
+		docMapping.AddFieldMappingsAt(field, keywordMapping)
+	}
+
+	result := bleve.NewIndexMapping()
+	result.AddDocumentMapping(docType, docMapping)
+	result.DefaultType = docType
+	return result
+}
diff --git a/internal/fulltext/bleve_test.go b/internal/fulltext/bleve_test.go
new file mode 100644
index 00000000..84a28242
--- /dev/null
+++ b/internal/fulltext/bleve_test.go
@@ -0,0 +1,250 @@
+// Copyright 2022 The Matrix.org Foundation C.I.C.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fulltext_test
+
+import (
+ "reflect"
+ "testing"
+
+ "github.com/matrix-org/gomatrixserverlib"
+ "github.com/matrix-org/util"
+
+ "github.com/matrix-org/dendrite/internal/fulltext"
+ "github.com/matrix-org/dendrite/setup/config"
+)
+
+// mustOpenIndex opens a fulltext index using the generated monolithic default
+// config. A non-empty tempDir switches the index to on-disk storage inside
+// that directory; otherwise the defaults are used unchanged. The test fails
+// immediately if the index cannot be opened.
+func mustOpenIndex(t *testing.T, tempDir string) *fulltext.Search {
+	t.Helper()
+
+	var cfg config.Fulltext
+	cfg.Defaults(config.DefaultOpts{Generate: true, Monolithic: true})
+	if tempDir != "" {
+		cfg.InMemory = false
+		cfg.IndexPath = config.Path(tempDir)
+	}
+
+	index, err := fulltext.New(cfg)
+	if err != nil {
+		t.Fatal("failed to open fulltext index:", err)
+	}
+	return index
+}
+
+// mustAddTestData indexes 30 "lorem ipsum" message events followed by one
+// room name and one room topic event in a single batch, failing the test if
+// the batch cannot be indexed.
+//
+// The stream positions of the messages start at firstStreamPos+1. The first
+// 16 messages share one random room, each of the remaining 14 gets a room of
+// its own. The name and topic events reuse the room and the stream position
+// of the last message.
+//
+// The returned slices hold the event ID and room ID of message i at index i;
+// the name and topic events are not part of them.
+func mustAddTestData(t *testing.T, fts *fulltext.Search, firstStreamPos int64) (eventIDs, roomIDs []string) {
+	t.Helper()
+
+	var elements []fulltext.IndexElement
+	pos := firstStreamPos
+	currentRoom := util.RandomString(16)
+
+	for n := 0; n < 30; n++ {
+		pos++
+		id := util.RandomString(16)
+		// Messages 0-15 stay in the first room, every later message is put
+		// into a fresh room.
+		if n > 15 {
+			currentRoom = util.RandomString(16)
+		}
+		msg := fulltext.IndexElement{
+			EventID:        id,
+			RoomID:         currentRoom,
+			Content:        "lorem ipsum",
+			StreamPosition: pos,
+		}
+		msg.SetContentType("m.room.message")
+		elements = append(elements, msg)
+		roomIDs = append(roomIDs, currentRoom)
+		eventIDs = append(eventIDs, id)
+	}
+
+	// Room name and topic, stored in the last room at the last position.
+	stateEvents := []struct {
+		eventType string
+		content   string
+	}{
+		{eventType: gomatrixserverlib.MRoomName, content: "Roomname testing"},
+		{eventType: gomatrixserverlib.MRoomTopic, content: "Room topic fulltext"},
+	}
+	for _, state := range stateEvents {
+		el := fulltext.IndexElement{
+			EventID:        util.RandomString(16),
+			RoomID:         currentRoom,
+			Content:        state.content,
+			StreamPosition: pos,
+		}
+		el.SetContentType(state.eventType)
+		elements = append(elements, el)
+	}
+
+	if err := fts.Index(elements...); err != nil {
+		t.Fatalf("failed to batch insert elements: %v", err)
+	}
+	return eventIDs, roomIDs
+}
+
+// TestOpen verifies that an on-disk index can be created, closed and then
+// opened again from the same directory.
+func TestOpen(t *testing.T) {
+	dataDir := t.TempDir()
+
+	// the first open has to create the index
+	fts := mustOpenIndex(t, dataDir)
+	if err := fts.Close(); err != nil {
+		t.Fatal("unable to close fulltext index", err)
+	}
+
+	// open existing index
+	fts = mustOpenIndex(t, dataDir)
+	// Check this Close as well instead of dropping its error in a defer.
+	if err := fts.Close(); err != nil {
+		t.Fatal("unable to close reopened fulltext index", err)
+	}
+}
+
+// TestIndex verifies that a single element as well as a batch of elements
+// can be added to an index opened with the default config.
+func TestIndex(t *testing.T) {
+	fts := mustOpenIndex(t, "")
+	defer fts.Close()
+
+	// index a single element first
+	const streamPos int64 = 1
+	single := fulltext.IndexElement{
+		RoomID:         util.RandomString(8),
+		EventID:        util.RandomString(16),
+		Content:        "lorem ipsum",
+		StreamPosition: streamPos,
+	}
+	single.SetContentType("m.room.message")
+	if err := fts.Index(single); err != nil {
+		t.Fatal("failed to index element", err)
+	}
+
+	// then a whole batch of random data
+	mustAddTestData(t, fts, streamPos)
+}
+
+// TestDelete verifies that deleting an event lowers the number of hits a
+// search in the event's room returns.
+func TestDelete(t *testing.T) {
+	fts := mustOpenIndex(t, "")
+	defer fts.Close()
+	eventIDs, roomIDs := mustAddTestData(t, fts, 0)
+
+	// totalHits counts the "lorem" matches in the first room.
+	totalHits := func() uint64 {
+		t.Helper()
+		res, err := fts.Search("lorem", roomIDs[:1], nil, 50, 0, false)
+		if err != nil {
+			t.Fatal(err)
+		}
+		return res.Total
+	}
+
+	before := totalHits()
+	if err := fts.Delete(eventIDs[0]); err != nil {
+		t.Fatal(err)
+	}
+	after := totalHits()
+
+	if before <= after {
+		t.Fatalf("got unexpected result: %d <= %d", before, after)
+	}
+}
+
+// TestSearch runs a table of searches, each against a fresh index filled by
+// mustAddTestData, and checks the number of returned hits. roomIndex selects
+// the rooms to search in by their index in the room ID slice returned by
+// mustAddTestData; no roomIndex means no room filter. For searches ordered by
+// stream position the first hit has to be the last message that was indexed.
+func TestSearch(t *testing.T) {
+	type args struct {
+		term             string
+		keys             []string
+		limit            int
+		from             int
+		orderByStreamPos bool
+		roomIndex        []int
+	}
+	tests := []struct {
+		name      string
+		args      args
+		wantCount int
+		wantErr   bool
+	}{
+		{
+			name:      "Can search for many results in one room",
+			wantCount: 16,
+			args: args{
+				term:      "lorem",
+				roomIndex: []int{0},
+				limit:     20,
+			},
+		},
+		{
+			name:      "Can search for one result in one room",
+			wantCount: 1,
+			args: args{
+				term:      "lorem",
+				roomIndex: []int{16},
+				limit:     20,
+			},
+		},
+		{
+			name:      "Can search for many results in multiple rooms",
+			wantCount: 17,
+			args: args{
+				term:      "lorem",
+				roomIndex: []int{0, 16},
+				limit:     20,
+			},
+		},
+		{
+			name:      "Can search for many results in all rooms, reversed",
+			wantCount: 30,
+			args: args{
+				term:             "lorem",
+				limit:            30,
+				orderByStreamPos: true,
+			},
+		},
+		{
+			name:      "Can search for specific search room name",
+			wantCount: 1,
+			args: args{
+				term:      "testing",
+				roomIndex: []int{},
+				limit:     20,
+				keys:      []string{"content.name"},
+			},
+		},
+		{
+			name:      "Can search for specific search room topic",
+			wantCount: 1,
+			args: args{
+				term:      "fulltext",
+				roomIndex: []int{},
+				limit:     20,
+				keys:      []string{"content.topic"},
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := mustOpenIndex(t, "")
+			// every subtest opens its own index, so it has to close it too
+			defer f.Close()
+
+			eventIDs, roomIDs := mustAddTestData(t, f, 0)
+			var searchRooms []string
+			for _, x := range tt.args.roomIndex {
+				searchRooms = append(searchRooms, roomIDs[x])
+			}
+			t.Logf("searching in rooms: %v - %v\n", searchRooms, tt.args.keys)
+
+			got, err := f.Search(tt.args.term, searchRooms, tt.args.keys, tt.args.limit, tt.args.from, tt.args.orderByStreamPos)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Search() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if err != nil {
+				// expected failure: there is no result to inspect
+				return
+			}
+			if !reflect.DeepEqual(len(got.Hits), tt.wantCount) {
+				t.Errorf("Search() got = %v, want %v", len(got.Hits), tt.wantCount)
+			}
+			if tt.args.orderByStreamPos {
+				// guard the index access below against an empty result
+				if len(got.Hits) == 0 {
+					t.Fatalf("expected ID %s, got no hits", eventIDs[29])
+				}
+				if got.Hits[0].ID != eventIDs[29] {
+					t.Fatalf("expected ID %s, got %s", eventIDs[29], got.Hits[0].ID)
+				}
+			}
+		})
+	}
+}
diff --git a/internal/fulltext/bleve_wasm.go b/internal/fulltext/bleve_wasm.go
new file mode 100644
index 00000000..a69a8926
--- /dev/null
+++ b/internal/fulltext/bleve_wasm.go
@@ -0,0 +1,65 @@
+// Copyright 2022 The Matrix.org Foundation C.I.C.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fulltext
+
+import (
+ "github.com/matrix-org/dendrite/setup/config"
+ "time"
+)
+
+// Search is the fulltext search stub declared in bleve_wasm.go. It holds no
+// state and none of its methods do any work.
+type Search struct{}
+// IndexElement describes the layout of an element to index. It has the same
+// fields as the IndexElement declared in bleve.go.
+type IndexElement struct {
+ EventID string
+ RoomID string
+ Content string
+ ContentType string
+ StreamPosition int64
+}
+
+// SearchResult is the result type returned by the Search stub in this file.
+// NOTE(review): the fields and JSON tags look like they are meant to mirror
+// bleve.SearchResult without depending on bleve - confirm.
+type SearchResult struct {
+ Status interface{} `json:"status"`
+ Request *interface{} `json:"request"`
+ Hits []interface{} `json:"hits"`
+ Total uint64 `json:"total_hits"`
+ MaxScore float64 `json:"max_score"`
+ Took time.Duration `json:"took"`
+ Facets interface{} `json:"facets"`
+}
+
+// SetContentType is a no-op in this stub: v is ignored and i is left
+// unchanged.
+func (i *IndexElement) SetContentType(v string) {}
+
+// New returns an empty Search stub. cfg is ignored and the returned error is
+// always nil.
+func New(cfg config.Fulltext) (fts *Search, err error) {
+ return &Search{}, nil
+}
+
+// Close does nothing and always returns nil.
+func (f *Search) Close() error {
+ return nil
+}
+
+// Index does nothing and always returns nil. It accepts any number of
+// elements, like the Index method of the bleve-backed Search in bleve.go, so
+// that the same calls compile against both implementations.
+func (f *Search) Index(elements ...IndexElement) error {
+	return nil
+}
+
+// BatchIndex does nothing and always returns nil; elements is ignored.
+func (f *Search) BatchIndex(elements []IndexElement) error {
+ return nil
+}
+
+// Delete does nothing and always returns nil; eventID is ignored.
+func (f *Search) Delete(eventID string) error {
+ return nil
+}
+
+// Search ignores all of its arguments and returns an empty SearchResult and
+// a nil error.
+// NOTE(review): the Search method in bleve.go returns *bleve.SearchResult,
+// whereas this stub returns SearchResult by value - confirm callers can
+// handle both.
+func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (SearchResult, error) {
+ return SearchResult{}, nil
+}