fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Switch to universal-ctags sandboxed mode.

This requires universal-ctags db3d9a6 or later.

Change-Id: I03732ae511cfea5a027b8762dfa1b7a2f3a0eb53

+94 -385
+5 -12
README.md
··· 82 82 The webserver can be started from a standard service management framework, such 83 83 as systemd. 84 84 85 + 85 86 SYMBOL SEARCH 86 87 ============= 87 88 88 - It is recommended to install CTags to improve ranking: 89 - 90 - * [Universal ctags](https://github.com/universal-ctags/ctags) is more up to date, but not commonly packaged for distributions. It must be compiled from source. 91 - * [Exuberant ctags](http://ctags.sourceforge.net/) is a languishing, but commonly available through Linux distributions. It has several known vulnerabilities. 92 - 93 - If you index untrusted code, it is strongly recommended to also 94 - install Bazel's sandbox, to avoid vulnerabilities of ctags opening up 95 - access to the indexing machine. A blessed version of the sandbox is under 96 - `cmd/zoek-sandbox`. It can be compiled with a simple `make` call. 97 - 98 - 99 - 89 + It is recommended to install [Universal 90 + ctags](https://github.com/universal-ctags/ctags) to improve ranking, 91 + in particular, version `db3d9a6` or newer, which supports seccomp 92 + sandboxing. 100 93 101 94 102 95 ACKNOWLEDGEMENTS
-2
all.bash
··· 9 9 zoekt-mirror-gitiles zoekt-test; do 10 10 go install github.com/google/zoekt/cmd/$p 11 11 done 12 - 13 - (cd cmd/zoekt-sandbox && make && cp zoekt-sandbox $GOPATH/bin/ )
+18 -15
build/builder.go
··· 30 30 "sync" 31 31 32 32 "github.com/google/zoekt" 33 + "github.com/google/zoekt/ctags" 33 34 ) 34 35 35 36 var DefaultDir = filepath.Join(os.Getenv("HOME"), ".zoekt") ··· 69 70 // If set, ctags must succeed. 70 71 CTagsMustSucceed bool 71 72 72 - // Path to namespace-sandbox from Bazel 73 - NamespaceSandbox string 74 - 75 73 // Write memory profiles to this file. 76 74 MemProfile string 77 75 } ··· 85 83 todo []*zoekt.Document 86 84 size int 87 85 86 + parser ctags.Parser 87 + 88 88 building sync.WaitGroup 89 89 90 90 errMu sync.Mutex ··· 102 102 // SetDefaults sets reasonable default options. 103 103 func (o *Options) SetDefaults() { 104 104 if o.CTags == "" { 105 - ctags, err := exec.LookPath("ctags-universal") 105 + ctags, err := exec.LookPath("universal-ctags") 106 106 if err == nil { 107 107 o.CTags = ctags 108 108 } ··· 114 114 o.CTags = ctags 115 115 } 116 116 } 117 - if o.NamespaceSandbox == "" { 118 - ns, err := exec.LookPath("zoekt-sandbox") 119 - if err == nil { 120 - o.NamespaceSandbox = ns 121 - } 122 - } 123 117 if o.Parallelism == 0 { 124 118 o.Parallelism = 1 125 119 } ··· 179 173 } 180 174 181 175 // NewBuilder creates a new Builder instance. 
182 - func NewBuilder(opt Options) (*Builder, error) { 183 - if opt.RepoDir == "" { 176 + func NewBuilder(opts Options) (*Builder, error) { 177 + if opts.RepoDir == "" { 184 178 return nil, fmt.Errorf("must set options.RepoDir") 185 179 } 186 180 187 181 b := &Builder{ 188 - opts: opt, 189 - throttle: make(chan int, opt.Parallelism), 182 + opts: opts, 183 + throttle: make(chan int, opts.Parallelism), 190 184 finishedShards: map[string]string{}, 191 185 } 192 186 187 + if strings.Contains(opts.CTags, "universal-ctags") { 188 + parser, err := ctags.NewParser(opts.CTags) 189 + if err != nil && opts.CTagsMustSucceed { 190 + return nil, fmt.Errorf("ctags.NewParser: %v", err) 191 + } 192 + 193 + b.parser = parser 194 + } 193 195 if _, err := b.newShardBuilder(); err != nil { 194 196 return nil, err 195 197 } ··· 330 332 if b.opts.CTags == "" && b.opts.CTagsMustSucceed { 331 333 return nil, fmt.Errorf("ctags binary not found, but CTagsMustSucceed set.") 332 334 } 335 + 333 336 if b.opts.CTags != "" { 334 - err := ctagsAddSymbols(todo, b.opts.CTags, b.opts.NamespaceSandbox) 337 + err := ctagsAddSymbols(todo, b.parser, b.opts.CTags) 335 338 if b.opts.CTagsMustSucceed && err != nil { 336 339 return nil, err 337 340 }
+20 -43
build/ctags.go
··· 28 28 "github.com/google/zoekt/ctags" 29 29 ) 30 30 31 - func runCTags(bin string, sandboxBin string, inputs map[string][]byte) ([]*ctags.Entry, error) { 31 + func runCTags(bin string, inputs map[string][]byte) ([]*ctags.Entry, error) { 32 32 const debug = false 33 33 if len(inputs) == 0 { 34 34 return nil, nil ··· 65 65 return nil, nil 66 66 } 67 67 68 - if sandboxBin != "" { 69 - // ctags parses untrusted input and is written in C. 70 - // Run it in a sandbox as defense in depth. The 71 - // namespace sandbox only works on Linux, 72 - // unfortunately. If someone packages a complex 73 - // exploit (eg. dirty COW) inside the repository, we 74 - // may still be SOL, but this is better than nothing. 75 - sandboxDir, err := ioutil.TempDir("", "") 76 - if err != nil { 77 - return nil, err 78 - } 79 - if !debug { 80 - defer os.RemoveAll(sandboxDir) 81 - } 82 - 83 - sandboxArgs := []string{ 84 - sandboxBin, 85 - "-s", sandboxDir, "-b", dir + "=input", "-d", "/input", 86 - // Make sure the binary is available in the sandbox. 87 - "-b", bin + "=" + "ctags", 88 - "-u", "3333", "-g", "3333", 89 - } 90 - args[0] = "/ctags" 91 - for _, d := range []string{"/lib", "/lib64"} { 92 - if _, err := os.Lstat(d); err == nil { 93 - sandboxArgs = append(sandboxArgs, "-b", d+"="+d[1:]) 94 - } 95 - } 96 - 97 - sandboxArgs = append(sandboxArgs, "-t", "tmp", "--") 98 - args = append(sandboxArgs, args...) 99 - } else { 100 - log.Println("WARNING: running ctags without sandboxing.") 101 - } 102 - 103 68 cmd := exec.Command(args[0], args[1:]...) 104 69 cmd.Dir = dir 105 70 ··· 145 110 return entries, nil 146 111 } 147 112 148 - func runCTagsChunked(bin, sandboxBin string, in map[string][]byte) ([]*ctags.Entry, error) { 113 + func runCTagsChunked(bin string, in map[string][]byte) ([]*ctags.Entry, error) { 149 114 var res []*ctags.Entry 150 115 151 116 cur := map[string][]byte{} ··· 156 121 157 122 // 100k seems reasonable. 
158 123 if sz > (100 << 10) { 159 - r, err := runCTags(bin, sandboxBin, cur) 124 + r, err := runCTags(bin, cur) 160 125 if err != nil { 161 126 return nil, err 162 127 } ··· 166 131 sz = 0 167 132 } 168 133 } 169 - r, err := runCTags(bin, sandboxBin, cur) 134 + r, err := runCTags(bin, cur) 170 135 if err != nil { 171 136 return nil, err 172 137 } ··· 174 139 return res, nil 175 140 } 176 141 177 - func ctagsAddSymbols(todo []*zoekt.Document, bin, sandboxBin string) error { 142 + func ctagsAddSymbols(todo []*zoekt.Document, parser ctags.Parser, bin string) error { 178 143 pathIndices := map[string]int{} 179 144 contents := map[string][]byte{} 180 145 for i, t := range todo { ··· 191 156 contents[t.Name] = t.Content 192 157 } 193 158 194 - entries, err := runCTagsChunked(bin, sandboxBin, contents) 195 - if err != nil { 196 - return err 159 + var err error 160 + var entries []*ctags.Entry 161 + if parser != nil { 162 + for k, v := range contents { 163 + es, err := parser.Parse(k, v) 164 + if err != nil { 165 + return err 166 + } 167 + entries = append(entries, es...) 168 + } 169 + } else { 170 + entries, err = runCTagsChunked(bin, contents) 171 + if err != nil { 172 + return err 173 + } 197 174 } 198 175 199 176 fileTags := map[string][]*ctags.Entry{}
-2
cmd/zoekt-sandbox/Makefile
··· 1 - zoekt-sandbox: mkbox.c 2 - $(CC) -std=c99 -Wall -O1 -g -o $@ $<
-300
cmd/zoekt-sandbox/mkbox.c
··· 1 - /* mkbox.c 2 - * 3 - * Copyright 2014 Brian Swetland <swetland@frotz.net> 4 - * 5 - * Licensed under the Apache License, Version 2.0 (the "License"); 6 - * you may not use this file except in compliance with the License. 7 - * You may obtain a copy of the License at 8 - * 9 - * http://www.apache.org/licenses/LICENSE-2.0 10 - * 11 - * Unless required by applicable law or agreed to in writing, software 12 - * distributed under the License is distributed on an "AS IS" BASIS, 13 - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 - * See the License for the specific language governing permissions and 15 - * limitations under the License. 16 - */ 17 - 18 - #define _GNU_SOURCE 19 - 20 - #include <errno.h> 21 - #include <fcntl.h> 22 - #include <linux/capability.h> 23 - #include <sched.h> 24 - #include <stdio.h> 25 - #include <stdlib.h> 26 - #include <string.h> 27 - #include <sys/mount.h> 28 - #include <sys/prctl.h> 29 - #include <sys/stat.h> 30 - #include <sys/types.h> 31 - #include <sys/wait.h> 32 - #include <unistd.h> 33 - 34 - 35 - /* can't find headers for these, but they're in glibc... */ 36 - int pivot_root(const char *new_root, const char *put_old); 37 - 38 - /* provided by sys/capability.h (libcap-dev), but provided here for 39 - easy compilation. */ 40 - int capset(cap_user_header_t h, cap_user_data_t d); 41 - int capset(cap_user_header_t h, cap_user_data_t d); 42 - 43 - #define errorf(...) do { fprintf(stderr, __VA_ARGS__); exit(-1); } while (0) 44 - 45 - static int checkreturn(int res, const char *name, char *arg, int line) { 46 - if (res >= 0) 47 - return res; 48 - fprintf(stderr, "mkbox.c:%d: error: %s(%s) failed: r=%d errno=%d (%s)\n", 49 - line, name, arg, res, errno, strerror(errno)); 50 - exit(-1); 51 - } 52 - 53 - #define ok(fname, arg...) 
checkreturn(fname(arg), #fname, #arg, __LINE__) 54 - 55 - int dropcaps(void) { 56 - struct __user_cap_header_struct header; 57 - struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; 58 - header.version = _LINUX_CAPABILITY_VERSION_3; 59 - header.pid = 0; 60 - memset(data, 0, sizeof(data)); 61 - return capset(&header, data); 62 - } 63 - 64 - const char* my_domain = "localdomain"; 65 - const char* my_host = "localhost"; 66 - 67 - void recursive_mkdir(const char* dir, int mode) { 68 - int end = 0; 69 - 70 - while (dir[end] != '\0') { 71 - char path[1024] = {}; 72 - char *endp = strchrnul(dir + end + 1, '/'); 73 - strncpy(path, dir, endp - dir); 74 - end = endp - dir; 75 - 76 - struct stat buf; 77 - if (lstat(path, &buf) >= 0 && (buf.st_mode & S_IFDIR) != 0) { 78 - continue; 79 - } 80 - 81 - if (mkdir(path, mode) < 0) { 82 - fprintf(stderr, "mkdir(%s): %d", path, errno); 83 - exit(-1); 84 - } 85 - } 86 - } 87 - 88 - 89 - int main(int argc, char **argv) { 90 - uid_t uid = getuid(); 91 - gid_t gid = getgid(); 92 - const char* child_dir = NULL; 93 - const char* binary = NULL; 94 - int verbose = 1; 95 - 96 - /* Ask the kernel to kill us with SIGKILL if our parent dies. 97 - * this carries over to the process launched via execv(). 98 - */ 99 - ok(prctl, PR_SET_PDEATHSIG, SIGKILL); 100 - 101 - /* CLONE_NEWNET kills performance for short-lived processes, 102 - * see https://lkml.org/lkml/2014/8/28/656), but let's avoid 103 - * rogue processes contacting other hosts. 
*/ 104 - int unshare_flags = CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWPID| 105 - CLONE_NEWIPC|CLONE_NEWUSER|CLONE_NEWNET; 106 - ok(unshare, unshare_flags); 107 - ok(setdomainname, my_domain, strlen(my_domain)); 108 - ok(sethostname, my_host, strlen(my_host)); 109 - int root_set = 0; 110 - int opt; 111 - while ((opt = getopt(argc, argv, "+b:B:d:D:g:qr:s:t:u:Z")) != -1) { 112 - switch (opt) { 113 - case 'q': /* quiet */ 114 - verbose = 0; 115 - break; 116 - case 's': // sandbox root directory 117 - /* ensure that changes to our mount namespace 118 - do not "leak" to outside namespaces (what 119 - mount --make-rprivate / does) 120 - */ 121 - mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL); 122 - 123 - /* mount the sandbox on top of itself in our 124 - new namespace. It will become our root 125 - filesystem */ 126 - ok(mount, optarg, optarg, NULL, MS_BIND|MS_NOSUID, NULL); 127 - 128 - /* step inside the to-be-root-directory */ 129 - if (verbose) { 130 - fprintf(stderr, "root dir: %s\n", optarg); 131 - } 132 - ok(chdir, optarg); 133 - root_set = 1; 134 - break; 135 - 136 - case 'B': /* binary to invoke */ 137 - binary = optarg; 138 - break; 139 - 140 - case 'b': // bind mount directory or file 141 - { 142 - char *dst = strchr(optarg, '='); 143 - if (dst == NULL) { 144 - errorf("argument must have '=': %s", optarg); 145 - } 146 - if (dst[1] == '/') { 147 - errorf("destination for %s must be relative to sandbox root.\n", optarg); 148 - } 149 - 150 - *dst = '\0'; 151 - dst++; 152 - char *src = optarg; 153 - 154 - if (verbose) { 155 - fprintf(stderr, "mount: %s => %s\n", src, dst); 156 - } 157 - 158 - struct stat buf = {}; 159 - ok(stat, src, &buf); 160 - 161 - if (S_ISDIR(buf.st_mode)) { 162 - if (lstat(dst, &buf) < 0) { 163 - recursive_mkdir(dst, 0755); 164 - } 165 - 166 - /* must use MS_REC, otherwise can't 167 - bind-mount a directory that has 168 - other directories mounted below. 
169 - 170 - The submounts won't be affected by 171 - MS_REMOUNT | MS_READONLY, 172 - unfortunately. 173 - */ 174 - ok(mount, src, dst, NULL, MS_REC|MS_BIND, NULL); 175 - } else { 176 - /* create bind points. Don't use 177 - O_EXCL so we can debug by repeatedly 178 - calling the same command-line. */ 179 - ok(close, ok(open, dst, O_WRONLY|O_CREAT, 0666)); 180 - ok(mount, src, dst, NULL, MS_BIND, NULL); 181 - } 182 - } 183 - break; 184 - case 't': // setup tmp dir 185 - if (verbose) { 186 - fprintf(stderr, "tmp: %s\n", optarg); 187 - } 188 - struct stat buf = {}; 189 - if (lstat(optarg, &buf) < 0) { 190 - recursive_mkdir(optarg, 0755); 191 - } 192 - 193 - ok(mount, "sandbox-tmp", optarg, "tmpfs", 194 - MS_NOSUID|MS_NOEXEC|MS_NOATIME, 195 - "size=16m,nr_inodes=16k,mode=755"); 196 - break; 197 - 198 - 199 - case 'u': // set UID 200 - { 201 - char buf[1024]; 202 - int newuid = -1; 203 - if (sscanf(optarg, "%d", &newuid) != 1) { 204 - errorf("could not parse %s", optarg); 205 - } 206 - 207 - sprintf(buf, "%d %d 1\n", newuid, uid); 208 - int fd = ok(open, "/proc/self/uid_map", O_WRONLY); 209 - ok(write, fd, buf, strlen(buf)); 210 - ok(close, fd); 211 - ok(setresuid, newuid, newuid, newuid); 212 - } 213 - break; 214 - 215 - case 'g': // set GID. 216 - { 217 - char buf[1024]; 218 - /* write "deny" to 219 - /proc/self/setgroups in order for 220 - our unprivileged process to be able 221 - to write arbitrary group IDs to 222 - gid_map. 223 - 224 - this proc file doesn't exist in 225 - older Linux kernels, in which case 226 - the correct fallback is to just 227 - ignore it (because that signals 228 - that the additional security check 229 - that /proc/self/setgroups relates 230 - to doesn't exist it). 
231 - */ 232 - int fd = open("/proc/self/setgroups", O_WRONLY); 233 - if (fd > 0) { 234 - strcpy(buf, "deny"); 235 - ok(write, fd, buf, strlen(buf)); 236 - ok(close, fd); 237 - } 238 - 239 - int newgid = -1; 240 - if (sscanf(optarg, "%d", &newgid) != 1) { 241 - errorf("could not parse %s", optarg); 242 - } 243 - 244 - sprintf(buf, "%d %d 1\n", newgid, gid); 245 - fd = ok(open, "/proc/self/gid_map", O_WRONLY); 246 - ok(write, fd, buf, strlen(buf)); 247 - ok(close, fd); 248 - 249 - /* initially we're nobody, change to new GID */ 250 - ok(setresgid, newgid, newgid, newgid); 251 - } 252 - break; 253 - 254 - case 'd': // dir for process 255 - child_dir = optarg; 256 - break; 257 - 258 - case 'D': 259 - /* create dir. Needed for creating dirs inside 260 - tmp/ , or bind mounts in subdirectories 261 - */ 262 - recursive_mkdir(optarg, 0755); 263 - break; 264 - 265 - default: 266 - errorf("option %c unknown", opt); 267 - 268 - } 269 - } 270 - 271 - if (!root_set) { 272 - errorf("-s option is mandatory"); 273 - } 274 - 275 - /* sandbox becomes our new root, detach the old one */ 276 - ok(mkdir, ".oldroot", 0755); 277 - ok(pivot_root, ".", ".oldroot"); 278 - 279 - /* pivot_root() may or may not affect its current working 280 - * directory. It is therefore recommended to call chdir("/") 281 - * immediately after pivot_root(). */ 282 - ok(chroot, "."); 283 - ok(umount2, ".oldroot", MNT_DETACH); 284 - ok(rmdir, ".oldroot"); 285 - 286 - /* remount root to finalize permissions */ 287 - ok(mount, "/", "/", NULL, 288 - MS_REMOUNT|MS_BIND|MS_NOEXEC|MS_NOSUID|MS_NODEV|MS_RDONLY, 289 - NULL); 290 - 291 - if (child_dir != NULL) { 292 - ok(chdir, child_dir); 293 - } 294 - 295 - ok(dropcaps); 296 - if (binary == NULL){ 297 - binary = argv[optind]; 298 - } 299 - ok(execv, binary, argv + optind); 300 - }
+46 -5
ctags/json.go
··· 18 18 "bufio" 19 19 "encoding/json" 20 20 "io" 21 + "log" 21 22 "os" 22 23 "os/exec" 23 24 "runtime" 25 + "strings" 24 26 "sync" 25 27 ) 28 + 29 + const debug = false 26 30 27 31 type ctagsProcess struct { 28 32 cmd *exec.Cmd ··· 34 38 procErr error 35 39 } 36 40 37 - var bin = "universal-ctags" 38 - 39 - func newProcess() (*ctagsProcess, error) { 41 + func newProcess(bin string) (*ctagsProcess, error) { 40 42 opt := "default" 41 43 if runtime.GOOS == "linux" { 42 44 opt = "sandbox" ··· 87 89 p.in.Close() 88 90 return err 89 91 } 92 + if debug { 93 + log.Printf("read %s", p.out.Text()) 94 + } 95 + 90 96 return json.Unmarshal(p.out.Bytes(), rep) 91 97 } 92 98 ··· 96 102 return err 97 103 } 98 104 body = append(body, '\n') 105 + if debug { 106 + log.Printf("post %q", body) 107 + } 108 + 99 109 if _, err = p.in.Write(body); err != nil { 100 110 return err 101 111 } 102 112 _, err = p.in.Write(content) 103 - 113 + if debug { 114 + log.Println(string(content)) 115 + } 104 116 return err 105 117 } 106 118 ··· 126 138 Kind string `json:"kind"` 127 139 } 128 140 129 - func (p *ctagsProcess) Process(name string, content []byte) ([]*Entry, error) { 141 + func (p *ctagsProcess) Parse(name string, content []byte) ([]*Entry, error) { 130 142 req := request{ 131 143 Command: "generate-tags", 132 144 Size: len(content), ··· 159 171 160 172 return es, nil 161 173 } 174 + 175 + type Parser interface { 176 + Parse(name string, content []byte) ([]*Entry, error) 177 + } 178 + 179 + type lockedParser struct { 180 + p Parser 181 + l sync.Mutex 182 + } 183 + 184 + func (lp *lockedParser) Parse(name string, content []byte) ([]*Entry, error) { 185 + lp.l.Lock() 186 + defer lp.l.Unlock() 187 + return lp.p.Parse(name, content) 188 + } 189 + 190 + func NewParser(bin string) (Parser, error) { 191 + if strings.Contains(bin, "universal-ctags") { 192 + // todo: restart, locking, parallelizatoin. 
193 + proc, err := newProcess(bin) 194 + if err != nil { 195 + return nil, err 196 + } 197 + return &lockedParser{p: proc}, nil 198 + } 199 + 200 + log.Fatal("not implemented") 201 + return nil, nil 202 + }
+2 -2
ctags/json_test.go
··· 20 20 ) 21 21 22 22 func TestJSON(t *testing.T) { 23 - p, err := newProcess() 23 + p, err := newProcess("universal-ctags") 24 24 if err != nil { 25 25 t.Fatal("newProcess", err) 26 26 } ··· 42 42 } 43 43 ` 44 44 name := "io/zoekt/Back.java" 45 - got, err := p.Process(name, []byte(java)) 45 + got, err := p.Parse(name, []byte(java)) 46 46 if err != nil { 47 47 t.Errorf("Process: %v", err) 48 48 }
+3 -4
doc/design.md
··· 315 315 security problems: at worst, a bug in the query parser would lead to a 316 316 crash. 317 317 318 - As part of the indexing process, we run the code through tools like 319 - `ctags`. This poses a security risk: especially crafted code could be 320 - used to own the indexing process. We propose to mitigate this by 321 - runnning the tagger in a namespace-based sandbox on Linux. 318 + The code to index is handled by `ctags` for symbol detection. The 319 + security risk this poses is mitigated by using seccomp-based 320 + sandboxing. 322 321 323 322 324 323 Privacy