matchtree.go at 20a30090ee38bb8d57dd98a47b9f6184fe7dba3f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 20a30090ee38bb8d57dd98a47b9f6184fe7dba3f 26 kB View raw
Keegan Carruthers-Smith default to eager runeDocSection (#503) 3y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"fmt"
  19	"log"
  20	"strings"
  21	"unicode/utf8"
  22
  23	"github.com/grafana/regexp"
  24	"github.com/sourcegraph/zoekt/query"
  25)
  26
  27// A docIterator iterates over documents in order.
  28type docIterator interface {
  29	// provide the next document where we can may find something
  30	// interesting.
  31	nextDoc() uint32
  32
  33	// clears any per-document state of the docIterator, and
  34	// prepares for evaluating the given doc. The argument is
  35	// strictly increasing over time.
  36	prepare(nextDoc uint32)
  37}
  38
  39const (
  40	costConst   = 0
  41	costMemory  = 1
  42	costContent = 2
  43	costRegexp  = 3
  44)
  45
  46const (
  47	costMin = costConst
  48	costMax = costRegexp
  49)
  50
  51// An expression tree coupled with matches. The matchtree has two
  52// functions:
  53//
  54// * it implements boolean combinations (and, or, not)
  55//
  56// * it implements shortcuts, where we skip documents (for example: if
  57// there are no trigram matches, we can be sure there are no substring
  58// matches). The matchtree iterates over the documents as they are
  59// ordered in the shard.
  60//
  61// The general process for a given (shard, query) is:
  62//
  63// - construct matchTree for the query
  64//
  65// - find all different leaf matchTrees (substring, regexp, etc.)
  66//
  67// in a loop:
  68//
  69//   - find next doc to process using nextDoc
  70//
  71//   - evaluate atoms (leaf expressions that match text)
  72//
  73//   - evaluate the tree using matches(), storing the result in map.
  74//
  75//   - if the complete tree returns (matches() == true) for the document,
  76//     collect all text matches by looking at leaf matchTrees
  77type matchTree interface {
  78	docIterator
  79
  80	// returns whether this matches, and if we are sure.
  81	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
  82}
  83
  84// docMatchTree iterates over documents for which predicate(docID) returns true.
  85type docMatchTree struct {
  86	// the number of documents in a shard.
  87	numDocs uint32
  88
  89	predicate func(docID uint32) bool
  90
  91	// provides additional information about the reason why the docMatchTree was
  92	// created.
  93	reason string
  94
  95	// mutable
  96	firstDone bool
  97	docID     uint32
  98}
  99
 100type bruteForceMatchTree struct {
 101	// mutable
 102	firstDone bool
 103	docID     uint32
 104}
 105
 106type andLineMatchTree struct {
 107	andMatchTree
 108}
 109
 110type andMatchTree struct {
 111	children []matchTree
 112}
 113
 114type orMatchTree struct {
 115	children []matchTree
 116}
 117
 118type notMatchTree struct {
 119	child matchTree
 120}
 121
 122// Returns only the filename of child matches.
 123type fileNameMatchTree struct {
 124	child matchTree
 125}
 126
 127// Don't visit this subtree for collecting matches.
 128type noVisitMatchTree struct {
 129	matchTree
 130}
 131
 132type regexpMatchTree struct {
 133	regexp *regexp.Regexp
 134
 135	fileName bool
 136
 137	// mutable
 138	reEvaluated bool
 139	found       []*candidateMatch
 140
 141	// nextDoc, prepare.
 142	bruteForceMatchTree
 143}
 144
 145type substrMatchTree struct {
 146	matchIterator
 147
 148	query         *query.Substring
 149	caseSensitive bool
 150	fileName      bool
 151
 152	// mutable
 153	current       []*candidateMatch
 154	contEvaluated bool
 155}
 156
 157type branchQueryMatchTree struct {
 158	fileMasks []uint64
 159	masks     []uint64
 160	repos     []uint16
 161
 162	// mutable
 163	firstDone bool
 164	docID     uint32
 165}
 166
 167type symbolRegexpMatchTree struct {
 168	matchTree
 169	regexp *regexp.Regexp
 170	all    bool // skips regex match if .*
 171
 172	reEvaluated bool
 173	found       []*candidateMatch
 174}
 175
 176func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 177	t.reEvaluated = false
 178}
 179
 180func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 181	if t.reEvaluated {
 182		return len(t.found) > 0, true
 183	}
 184
 185	if cost < costRegexp {
 186		return false, false
 187	}
 188
 189	sections := cp.docSections()
 190	content := cp.data(false)
 191
 192	found := t.found[:0]
 193	for i, sec := range sections {
 194		var idx []int
 195		if t.all {
 196			idx = []int{0, int(sec.End - sec.Start)}
 197		} else {
 198			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 199			if idx == nil {
 200				continue
 201			}
 202		}
 203
 204		cm := &candidateMatch{
 205			byteOffset:  sec.Start + uint32(idx[0]),
 206			byteMatchSz: uint32(idx[1] - idx[0]),
 207			symbol:      true,
 208			symbolIdx:   uint32(i),
 209		}
 210		found = append(found, cm)
 211	}
 212	t.found = found
 213	t.reEvaluated = true
 214
 215	return len(t.found) > 0, true
 216}
 217
 218type symbolSubstrMatchTree struct {
 219	*substrMatchTree
 220
 221	patternSize   uint32
 222	fileEndRunes  []uint32
 223	fileEndSymbol []uint32
 224
 225	doc      uint32
 226	sections []DocumentSection
 227
 228	secID uint32
 229}
 230
 231func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 232	t.substrMatchTree.prepare(doc)
 233	t.doc = doc
 234
 235	var fileStart uint32
 236	if doc > 0 {
 237		fileStart = t.fileEndRunes[doc-1]
 238	}
 239
 240	var sections []DocumentSection
 241	if len(t.sections) > 0 {
 242		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 243		if most == uint32(len(t.sections)) {
 244			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 245		} else {
 246			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 247				t.secID++
 248			}
 249
 250			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 251			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 252				symbolEnd++
 253			}
 254
 255			sections = t.sections[t.secID:symbolEnd]
 256		}
 257	}
 258
 259	secIdx := 0
 260	trimmed := t.current[:0]
 261	for len(sections) > secIdx && len(t.current) > 0 {
 262		start := fileStart + t.current[0].runeOffset
 263		end := start + t.patternSize
 264		if start >= sections[secIdx].End {
 265			secIdx++
 266			continue
 267		}
 268
 269		if start < sections[secIdx].Start {
 270			t.current = t.current[1:]
 271			continue
 272		}
 273
 274		if end <= sections[secIdx].End {
 275			t.current[0].symbol = true
 276			t.current[0].symbolIdx = uint32(secIdx)
 277			trimmed = append(trimmed, t.current[0])
 278		}
 279
 280		t.current = t.current[1:]
 281	}
 282	t.current = trimmed
 283}
 284
 285// all prepare methods
 286
 287func (t *bruteForceMatchTree) prepare(doc uint32) {
 288	t.docID = doc
 289	t.firstDone = true
 290}
 291
 292func (t *docMatchTree) prepare(doc uint32) {
 293	t.docID = doc
 294	t.firstDone = true
 295}
 296
 297func (t *andMatchTree) prepare(doc uint32) {
 298	for _, c := range t.children {
 299		c.prepare(doc)
 300	}
 301}
 302
 303func (t *regexpMatchTree) prepare(doc uint32) {
 304	t.found = t.found[:0]
 305	t.reEvaluated = false
 306	t.bruteForceMatchTree.prepare(doc)
 307}
 308
 309func (t *orMatchTree) prepare(doc uint32) {
 310	for _, c := range t.children {
 311		c.prepare(doc)
 312	}
 313}
 314
 315func (t *notMatchTree) prepare(doc uint32) {
 316	t.child.prepare(doc)
 317}
 318
 319func (t *fileNameMatchTree) prepare(doc uint32) {
 320	t.child.prepare(doc)
 321}
 322
 323func (t *substrMatchTree) prepare(nextDoc uint32) {
 324	t.matchIterator.prepare(nextDoc)
 325	t.current = t.matchIterator.candidates()
 326	t.contEvaluated = false
 327}
 328
 329func (t *branchQueryMatchTree) prepare(doc uint32) {
 330	t.firstDone = true
 331	t.docID = doc
 332}
 333
 334// nextDoc
 335
 336func (t *docMatchTree) nextDoc() uint32 {
 337	var start uint32
 338	if t.firstDone {
 339		start = t.docID + 1
 340	}
 341	for i := start; i < t.numDocs; i++ {
 342		if t.predicate(i) {
 343			return i
 344		}
 345	}
 346	return maxUInt32
 347}
 348
 349func (t *bruteForceMatchTree) nextDoc() uint32 {
 350	if !t.firstDone {
 351		return 0
 352	}
 353	return t.docID + 1
 354}
 355
 356func (t *andMatchTree) nextDoc() uint32 {
 357	var max uint32
 358	for _, c := range t.children {
 359		m := c.nextDoc()
 360		if m > max {
 361			max = m
 362		}
 363	}
 364	return max
 365}
 366
 367func (t *orMatchTree) nextDoc() uint32 {
 368	min := uint32(maxUInt32)
 369	for _, c := range t.children {
 370		m := c.nextDoc()
 371		if m < min {
 372			min = m
 373		}
 374	}
 375	return min
 376}
 377
 378func (t *notMatchTree) nextDoc() uint32 {
 379	return 0
 380}
 381
 382func (t *fileNameMatchTree) nextDoc() uint32 {
 383	return t.child.nextDoc()
 384}
 385
 386func (t *branchQueryMatchTree) nextDoc() uint32 {
 387	var start uint32
 388	if t.firstDone {
 389		start = t.docID + 1
 390	}
 391
 392	for i := start; i < uint32(len(t.fileMasks)); i++ {
 393		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 394			return i
 395		}
 396	}
 397	return maxUInt32
 398}
 399
 400// all String methods
 401
 402func (t *bruteForceMatchTree) String() string {
 403	return "all"
 404}
 405
 406func (t *docMatchTree) String() string {
 407	return fmt.Sprintf("doc(%s)", t.reason)
 408}
 409
 410func (t *andMatchTree) String() string {
 411	return fmt.Sprintf("and%v", t.children)
 412}
 413
 414func (t *regexpMatchTree) String() string {
 415	f := ""
 416	if t.fileName {
 417		f = "f"
 418	}
 419	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 420}
 421
 422func (t *orMatchTree) String() string {
 423	return fmt.Sprintf("or%v", t.children)
 424}
 425
 426func (t *notMatchTree) String() string {
 427	return fmt.Sprintf("not(%v)", t.child)
 428}
 429
 430func (t *noVisitMatchTree) String() string {
 431	return fmt.Sprintf("novisit(%v)", t.matchTree)
 432}
 433
 434func (t *fileNameMatchTree) String() string {
 435	return fmt.Sprintf("f(%v)", t.child)
 436}
 437
 438func (t *substrMatchTree) String() string {
 439	f := ""
 440	if t.fileName {
 441		f = "f"
 442	}
 443
 444	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 445}
 446
 447func (t *branchQueryMatchTree) String() string {
 448	return fmt.Sprintf("branch(%x)", t.masks)
 449}
 450
 451func (t *symbolSubstrMatchTree) String() string {
 452	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 453}
 454
 455func (t *symbolRegexpMatchTree) String() string {
 456	return fmt.Sprintf("symbol(%v)", t.matchTree)
 457}
 458
 459// visitMatches visits all atoms in matchTree. Note: This visits
 460// noVisitMatchTree. For collecting matches use visitMatches.
 461func visitMatchTree(t matchTree, f func(matchTree)) {
 462	switch s := t.(type) {
 463	case *andMatchTree:
 464		for _, ch := range s.children {
 465			visitMatchTree(ch, f)
 466		}
 467	case *orMatchTree:
 468		for _, ch := range s.children {
 469			visitMatchTree(ch, f)
 470		}
 471	case *andLineMatchTree:
 472		visitMatchTree(&s.andMatchTree, f)
 473	case *noVisitMatchTree:
 474		visitMatchTree(s.matchTree, f)
 475	case *notMatchTree:
 476		visitMatchTree(s.child, f)
 477	case *fileNameMatchTree:
 478		visitMatchTree(s.child, f)
 479	case *symbolSubstrMatchTree:
 480		visitMatchTree(s.substrMatchTree, f)
 481	case *symbolRegexpMatchTree:
 482		visitMatchTree(s.matchTree, f)
 483	default:
 484		f(t)
 485	}
 486}
 487
 488// visitMatches visits all atoms which can contribute matches. Note: This
 489// skips noVisitMatchTree.
 490func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 491	switch s := t.(type) {
 492	case *andMatchTree:
 493		for _, ch := range s.children {
 494			if known[ch] {
 495				visitMatches(ch, known, f)
 496			}
 497		}
 498	case *andLineMatchTree:
 499		visitMatches(&s.andMatchTree, known, f)
 500	case *orMatchTree:
 501		for _, ch := range s.children {
 502			if known[ch] {
 503				visitMatches(ch, known, f)
 504			}
 505		}
 506	case *symbolSubstrMatchTree:
 507		visitMatches(s.substrMatchTree, known, f)
 508	case *notMatchTree:
 509	case *noVisitMatchTree:
 510		// don't collect into negative trees.
 511	case *fileNameMatchTree:
 512		// We will just gather the filename if we do not visit this tree.
 513	default:
 514		f(s)
 515	}
 516}
 517
 518// all matches() methods.
 519
 520func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 521	return t.predicate(cp.idx), true
 522}
 523
 524func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 525	return true, true
 526}
 527
 528// andLineMatchTree is a performance optimization of andMatchTree. For content
 529// searches we don't want to run the regex engine if there is no line that
 530// contains matches from all terms.
 531func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 532	matches, sure := t.andMatchTree.matches(cp, cost, known)
 533	if !(sure && matches) {
 534		return matches, sure
 535	}
 536
 537	// find child with fewest candidates
 538	min := maxUInt32
 539	fewestChildren := 0
 540	for ix, child := range t.children {
 541		v, ok := child.(*substrMatchTree)
 542		// make sure we are running a content search and that all candidates are a
 543		// substrMatchTree
 544		if !ok || v.fileName {
 545			return matches, sure
 546		}
 547		if len(v.current) < min {
 548			min = len(v.current)
 549			fewestChildren = ix
 550		}
 551	}
 552
 553	type lineRange struct {
 554		start int
 555		end   int
 556	}
 557	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 558	prev := -1
 559	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 560		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 561		if line == prev {
 562			continue
 563		}
 564		prev = line
 565		lines = append(lines, lineRange{byteStart, byteEnd})
 566	}
 567
 568	// children keeps track of the children's candidates we have already seen.
 569	children := make([][]*candidateMatch, 0, len(t.children)-1)
 570	for j, child := range t.children {
 571		if j == fewestChildren {
 572			continue
 573		}
 574		children = append(children, child.(*substrMatchTree).current)
 575	}
 576
 577nextLine:
 578	for i := 0; i < len(lines); i++ {
 579		hits := 1
 580	nextChild:
 581		for j := range children {
 582		nextCandidate:
 583			for len(children[j]) > 0 {
 584				candidate := children[j][0]
 585				bo := int(cp.findOffset(false, candidate.runeOffset))
 586				if bo < lines[i].start {
 587					children[j] = children[j][1:]
 588					continue nextCandidate
 589				}
 590				if bo <= lines[i].end {
 591					hits++
 592					continue nextChild
 593				}
 594				// move the `lines` iterator forward until bo <= line.end
 595				for i < len(lines) && bo > lines[i].end {
 596					i++
 597				}
 598				i--
 599				continue nextLine
 600			}
 601		}
 602		// return early once we found any line that contains matches from all children
 603		if hits == len(t.children) {
 604			return matches, true
 605		}
 606	}
 607	return false, true
 608}
 609
 610func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 611	sure := true
 612
 613	for _, ch := range t.children {
 614		v, ok := evalMatchTree(cp, cost, known, ch)
 615		if ok && !v {
 616			return false, true
 617		}
 618		if !ok {
 619			sure = false
 620		}
 621	}
 622
 623	return true, sure
 624}
 625
 626func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 627	matches := false
 628	sure := true
 629	for _, ch := range t.children {
 630		v, ok := evalMatchTree(cp, cost, known, ch)
 631		if ok {
 632			// we could short-circuit, but we want to use
 633			// the other possibilities as a ranking
 634			// signal.
 635			matches = matches || v
 636		} else {
 637			sure = false
 638		}
 639	}
 640	return matches, sure
 641}
 642
 643func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 644	return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true
 645}
 646
 647func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 648	if t.reEvaluated {
 649		return len(t.found) > 0, true
 650	}
 651
 652	if cost < costRegexp {
 653		return false, false
 654	}
 655
 656	cp.stats.RegexpsConsidered++
 657	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 658	found := t.found[:0]
 659	for _, idx := range idxs {
 660		cm := &candidateMatch{
 661			byteOffset:  uint32(idx[0]),
 662			byteMatchSz: uint32(idx[1] - idx[0]),
 663			fileName:    t.fileName,
 664		}
 665
 666		found = append(found, cm)
 667	}
 668	t.found = found
 669	t.reEvaluated = true
 670
 671	return len(t.found) > 0, true
 672}
 673
 674// breakMatchesOnNewlines returns matches resulting from breaking each element
 675// of cms on newlines within text.
 676func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 677	var lineCMs []*candidateMatch
 678	for _, cm := range cms {
 679		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 680	}
 681	return lineCMs
 682}
 683
 684// breakOnNewlines returns matches resulting from breaking cm on newlines
 685// within text.
 686func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 687	var cms []*candidateMatch
 688	addMe := &candidateMatch{}
 689	*addMe = *cm
 690	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 691		if text[i] == '\n' {
 692			addMe.byteMatchSz = i - addMe.byteOffset
 693			if addMe.byteMatchSz != 0 {
 694				cms = append(cms, addMe)
 695			}
 696
 697			addMe = &candidateMatch{}
 698			*addMe = *cm
 699			addMe.byteOffset = i + 1
 700		}
 701	}
 702	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 703	if addMe.byteMatchSz != 0 {
 704		cms = append(cms, addMe)
 705	}
 706	return cms
 707}
 708
 709func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 710	if v, ok := known[mt]; ok {
 711		return v, true
 712	}
 713
 714	v, ok := mt.matches(cp, cost, known)
 715	if ok {
 716		known[mt] = v
 717	}
 718
 719	return v, ok
 720}
 721
 722func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 723	v, ok := evalMatchTree(cp, cost, known, t.child)
 724	return !v, ok
 725}
 726
 727func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 728	return evalMatchTree(cp, cost, known, t.child)
 729}
 730
 731func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 732	if t.contEvaluated {
 733		return len(t.current) > 0, true
 734	}
 735
 736	if len(t.current) == 0 {
 737		return false, true
 738	}
 739
 740	if t.fileName && cost < costMemory {
 741		return false, false
 742	}
 743
 744	if !t.fileName && cost < costContent {
 745		return false, false
 746	}
 747
 748	pruned := t.current[:0]
 749	for _, m := range t.current {
 750		if m.byteOffset == 0 && m.runeOffset > 0 {
 751			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 752		}
 753		if m.matchContent(cp.data(m.fileName)) {
 754			pruned = append(pruned, m)
 755		}
 756	}
 757	t.current = pruned
 758	t.contEvaluated = true
 759
 760	return len(t.current) > 0, true
 761}
 762
 763func (d *indexData) newMatchTree(q query.Q) (matchTree, error) {
 764	if q == nil {
 765		return nil, fmt.Errorf("got nil (sub)query")
 766	}
 767	switch s := q.(type) {
 768	case *query.Regexp:
 769		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 770		// superset of the regexp. If the returned matchTree is equivalent to the
 771		// original regexp, it returns true. An equivalent matchTree has the same
 772		// behaviour as the original regexp and can be used instead.
 773		//
 774		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 775		if err != nil {
 776			return nil, err
 777		}
 778		// if the query can be used in place of the regexp
 779		// return the subtree
 780		if isEq {
 781			return subMT, nil
 782		}
 783
 784		prefix := ""
 785		if !s.CaseSensitive {
 786			prefix = "(?i)"
 787		}
 788
 789		tr := &regexpMatchTree{
 790			regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 791			fileName: s.FileName,
 792		}
 793
 794		return &andMatchTree{
 795			children: []matchTree{
 796				tr, &noVisitMatchTree{subMT},
 797			},
 798		}, nil
 799	case *query.And:
 800		var r []matchTree
 801		for _, ch := range s.Children {
 802			ct, err := d.newMatchTree(ch)
 803			if err != nil {
 804				return nil, err
 805			}
 806			r = append(r, ct)
 807		}
 808		return &andMatchTree{r}, nil
 809	case *query.Or:
 810		var r []matchTree
 811		for _, ch := range s.Children {
 812			ct, err := d.newMatchTree(ch)
 813			if err != nil {
 814				return nil, err
 815			}
 816			r = append(r, ct)
 817		}
 818		return &orMatchTree{r}, nil
 819	case *query.Not:
 820		ct, err := d.newMatchTree(s.Child)
 821		return &notMatchTree{
 822			child: ct,
 823		}, err
 824
 825	case *query.Type:
 826		if s.Type != query.TypeFileName {
 827			break
 828		}
 829
 830		ct, err := d.newMatchTree(s.Child)
 831		if err != nil {
 832			return nil, err
 833		}
 834
 835		return &fileNameMatchTree{
 836			child: ct,
 837		}, nil
 838
 839	case *query.Substring:
 840		return d.newSubstringMatchTree(s)
 841
 842	case *query.Branch:
 843		masks := make([]uint64, 0, len(d.repoMetaData))
 844		if s.Pattern == "HEAD" {
 845			for i := 0; i < len(d.repoMetaData); i++ {
 846				masks = append(masks, 1)
 847			}
 848		} else {
 849			for _, branchIDs := range d.branchIDs {
 850				mask := uint64(0)
 851				for nm, m := range branchIDs {
 852					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
 853						mask |= uint64(m)
 854					}
 855				}
 856				masks = append(masks, mask)
 857			}
 858
 859		}
 860		return &branchQueryMatchTree{
 861			masks:     masks,
 862			fileMasks: d.fileBranchMasks,
 863			repos:     d.repos,
 864		}, nil
 865	case *query.Const:
 866		if s.Value {
 867			return &bruteForceMatchTree{}, nil
 868		} else {
 869			return &noMatchTree{"const"}, nil
 870		}
 871	case *query.Language:
 872		code, ok := d.metaData.LanguageMap[s.Language]
 873		if !ok {
 874			return &noMatchTree{"lang"}, nil
 875		}
 876		return &docMatchTree{
 877			reason:  "language",
 878			numDocs: d.numDocs(),
 879			predicate: func(docID uint32) bool {
 880				return d.getLanguage(docID) == code
 881			},
 882		}, nil
 883
 884	case *query.Symbol:
 885		subMT, err := d.newMatchTree(s.Expr)
 886		if err != nil {
 887			return nil, err
 888		}
 889
 890		if substr, ok := subMT.(*substrMatchTree); ok {
 891			// We have a feature flag for lazy decoding. If runeDocSectionsRaw is
 892			// non-nil it means we need to lazily decode on request.
 893			sections := d.runeDocSections
 894			if sections == nil && d.runeDocSectionsRaw != nil {
 895				sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
 896			}
 897
 898			return &symbolSubstrMatchTree{
 899				substrMatchTree: substr,
 900				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
 901				fileEndRunes:    d.fileEndRunes,
 902				fileEndSymbol:   d.fileEndSymbol,
 903				sections:        sections,
 904			}, nil
 905		}
 906
 907		var regexp *regexp.Regexp
 908		visitMatchTree(subMT, func(mt matchTree) {
 909			if t, ok := mt.(*regexpMatchTree); ok {
 910				regexp = t.regexp
 911			}
 912		})
 913		if regexp == nil {
 914			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
 915		}
 916
 917		return &symbolRegexpMatchTree{
 918			regexp:    regexp,
 919			all:       regexp.String() == "(?i)(?-s:.)*",
 920			matchTree: subMT,
 921		}, nil
 922
 923	case *query.FileNameSet:
 924		return &docMatchTree{
 925			reason:  "FileNameSet",
 926			numDocs: d.numDocs(),
 927			predicate: func(docID uint32) bool {
 928				fileName := d.fileName(docID)
 929				_, ok := s.Set[string(fileName)]
 930				return ok
 931			},
 932		}, nil
 933
 934	case *query.BranchesRepos:
 935		reposBranchesWant := make([]uint64, len(d.repoMetaData))
 936		for repoIdx := range d.repoMetaData {
 937			var mask uint64
 938			for _, br := range s.List {
 939				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
 940					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
 941				}
 942			}
 943			reposBranchesWant[repoIdx] = mask
 944		}
 945		return &docMatchTree{
 946			reason:  "BranchesRepos",
 947			numDocs: d.numDocs(),
 948			predicate: func(docID uint32) bool {
 949				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
 950			},
 951		}, nil
 952
 953	case *query.RepoSet:
 954		reposWant := make([]bool, len(d.repoMetaData))
 955		for repoIdx, r := range d.repoMetaData {
 956			if _, ok := s.Set[r.Name]; ok {
 957				reposWant[repoIdx] = true
 958			}
 959		}
 960		return &docMatchTree{
 961			reason:  "RepoSet",
 962			numDocs: d.numDocs(),
 963			predicate: func(docID uint32) bool {
 964				return reposWant[d.repos[docID]]
 965			},
 966		}, nil
 967
 968	case *query.Repo:
 969		reposWant := make([]bool, len(d.repoMetaData))
 970		for repoIdx, r := range d.repoMetaData {
 971			if s.Regexp.MatchString(r.Name) {
 972				reposWant[repoIdx] = true
 973			}
 974		}
 975		return &docMatchTree{
 976			reason:  "Repo",
 977			numDocs: d.numDocs(),
 978			predicate: func(docID uint32) bool {
 979				return reposWant[d.repos[docID]]
 980			},
 981		}, nil
 982
 983	case *query.RepoRegexp:
 984		reposWant := make([]bool, len(d.repoMetaData))
 985		for repoIdx, r := range d.repoMetaData {
 986			if s.Regexp.MatchString(r.Name) {
 987				reposWant[repoIdx] = true
 988			}
 989		}
 990		return &docMatchTree{
 991			reason:  "RepoRegexp",
 992			numDocs: d.numDocs(),
 993			predicate: func(docID uint32) bool {
 994				return reposWant[d.repos[docID]]
 995			},
 996		}, nil
 997
 998	case query.RawConfig:
 999		return &docMatchTree{
1000			reason:  s.String(),
1001			numDocs: d.numDocs(),
1002			predicate: func(docID uint32) bool {
1003				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1004			},
1005		}, nil
1006	}
1007	log.Panicf("type %T", q)
1008	return nil, nil
1009}
1010
1011func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1012	st := &substrMatchTree{
1013		query:         s,
1014		caseSensitive: s.CaseSensitive,
1015		fileName:      s.FileName,
1016	}
1017
1018	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1019		prefix := ""
1020		if !s.CaseSensitive {
1021			prefix = "(?i)"
1022		}
1023		t := &regexpMatchTree{
1024			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1025			fileName: s.FileName,
1026		}
1027		return t, nil
1028	}
1029
1030	result, err := d.iterateNgrams(s)
1031	if err != nil {
1032		return nil, err
1033	}
1034	st.matchIterator = result
1035	return st, nil
1036}
1037
1038// pruneMatchTree removes impossible branches from the matchTree, as indicated
1039// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1040func pruneMatchTree(mt matchTree) (matchTree, error) {
1041	var err error
1042	switch mt := mt.(type) {
1043	// leaf nodes that we test with the filter:
1044	case *substrMatchTree:
1045		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1046			if _, ok := res.matchIterator.(*noMatchTree); ok {
1047				return nil, nil
1048			}
1049		}
1050	// recursive tree structures:
1051	case *andMatchTree:
1052		// Any branch of an and becoming impossible means the entire clause
1053		// is impossible. Otherwise, just handle rewrites.
1054		for i, child := range mt.children {
1055			newChild, err := pruneMatchTree(child)
1056			if err != nil {
1057				return nil, err
1058			}
1059			if newChild == nil {
1060				return nil, nil
1061			}
1062			mt.children[i] = newChild
1063		}
1064	case *orMatchTree:
1065		// *All* branches of an OR becoming impossible means the entire clause
1066		// is impossible. Otherwise, drop impossible subclauses and handle
1067		// rewrites, including simplifying to a singular resulting child branch.
1068		n := 0
1069		for _, child := range mt.children {
1070			newChild, err := pruneMatchTree(child)
1071			if err != nil {
1072				return nil, err
1073			}
1074			if newChild != nil {
1075				mt.children[n] = newChild
1076				n++
1077			}
1078		}
1079		mt.children = mt.children[:n]
1080		if len(mt.children) == 1 {
1081			return mt.children[0], nil
1082		} else if len(mt.children) == 0 {
1083			return nil, nil
1084		}
1085	case *noVisitMatchTree:
1086		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1087		if err != nil {
1088			return nil, err
1089		}
1090		if mt.matchTree == nil {
1091			return nil, nil
1092		}
1093	case *fileNameMatchTree:
1094		mt.child, err = pruneMatchTree(mt.child)
1095	case *andLineMatchTree:
1096		child, err := pruneMatchTree(&mt.andMatchTree)
1097		if err != nil {
1098			return nil, err
1099		}
1100		if child == nil {
1101			return nil, nil
1102		}
1103		if c, ok := child.(*andMatchTree); ok {
1104			mt.andMatchTree = *c
1105		} else {
1106			// the and was simplified to a single clause,
1107			// so the linematch portion is irrelevant.
1108			return mt, nil
1109		}
1110	case *notMatchTree:
1111		mt.child, err = pruneMatchTree(mt.child)
1112		if err != nil {
1113			return nil, err
1114		}
1115		if mt.child == nil {
1116			// not false => true
1117			return &bruteForceMatchTree{}, nil
1118		}
1119	// unhandled:
1120	case *docMatchTree:
1121	case *bruteForceMatchTree:
1122	case *regexpMatchTree:
1123	}
1124	return mt, err
1125}
Configure Feed

Configure Feed