ipath_file_ops.c revision e35d710d0c5b74bc9833d6a3791706bd577a3724
/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>

#include "ipath_kernel.h"
#include "ipath_common.h"

static int ipath_open(struct inode *, struct file *);
static int ipath_close(struct inode *, struct file *);
static ssize_t ipath_write(struct file *, const char __user *, size_t,
			   loff_t *);
static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
static int ipath_mmap(struct file *, struct vm_area_struct *);

static struct file_operations ipath_file_ops = {
	.owner = THIS_MODULE,
	.write = ipath_write,
	.open = ipath_open,
	.release = ipath_close,
	.poll = ipath_poll,
	.mmap = ipath_mmap
};
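/**
 * ipath_get_base_info - fill and copy out the user's base info
 * @pd: the port
 * @ubase: user address to copy the struct ipath_base_info to
 * @ubase_size: size of the user's buffer
 *
 * Fills in the sizes, counts, and (physical) addresses that user
 * code needs in order to mmap and use the port, then copies the
 * whole struct out with copy_to_user().  A too-small user buffer
 * is treated as a likely version mismatch and fails with -EINVAL.
 */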
static int ipath_get_base_info(struct ipath_portdata *pd,
			       void __user *ubase, size_t ubase_size)
{
	int ret = 0;
	struct ipath_base_info *kinfo = NULL;
	struct ipath_devdata *dd = pd->port_dd;

	if (ubase_size < sizeof(*kinfo)) {
		ipath_cdbg(PROC,
			   "Base size %lu, need %lu (version mismatch?)\n",
			   (unsigned long) ubase_size,
			   (unsigned long) sizeof(*kinfo));
		ret = -EINVAL;
		goto bail;
	}

	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
	if (kinfo == NULL) {
		ret = -ENOMEM;
		goto bail;
	}

	ret = dd->ipath_f_get_base_info(pd, kinfo);
	if (ret < 0)
		goto bail;

	kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
	kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
	kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
	kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
	/*
	 * have to mmap whole thing
	 */
	kinfo->spi_rcv_egrbuftotlen =
		pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
	kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
		pd->port_rcvegrbuf_chunks;
	kinfo->spi_tidcnt = dd->ipath_rcvtidcnt;
	/*
	 * for this use, may be ipath_cfgports summed over all chips that
	 * are configured and present
	 */
	kinfo->spi_nports = dd->ipath_cfgports;
	/* unit (chip/board) our port is on */
	kinfo->spi_unit = dd->ipath_unit;
	/* for now, only a single page */
	kinfo->spi_tid_maxsize = PAGE_SIZE;

	/*
	 * Doing this per port, and based on the skip value, etc.  This has
	 * to be the actual buffer size, since the protocol code treats it
	 * as an array.
	 *
	 * These have to be set to user addresses in the user code via mmap.
	 * These values are used on return to user code for the mmap target
	 * addresses only.  For 32 bit, same 44 bit address problem, so use
	 * the physical address, not virtual.  Before 2.6.11, using the
	 * page_address() macro worked, but in 2.6.11, even that returns the
	 * full 64 bit address (upper bits all 1's).  So far, using the
	 * physical addresses (or chip offsets, for chip mapping) works, but
	 * no doubt some future kernel release will change that, and we'll be
	 * on to yet another method of dealing with this.
	 */
	kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
	kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
	kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
	kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
		(void *) dd->ipath_statusp -
		(void *) dd->ipath_pioavailregs_dma;
	kinfo->spi_piobufbase = (u64) pd->port_piobufs;
	kinfo->__spi_uregbase =
		dd->ipath_uregbase + dd->ipath_palign * pd->port_port;

	kinfo->spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1);
	kinfo->spi_piocnt = dd->ipath_pbufsport;
	kinfo->spi_pioalign = dd->ipath_palign;

	kinfo->spi_qpair = IPATH_KD_QP;
	kinfo->spi_piosize = dd->ipath_ibmaxlen;
	kinfo->spi_mtu = dd->ipath_ibmaxlen;	/* maxlen, not ibmtu */
	kinfo->spi_port = pd->port_port;
	kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
	kinfo->spi_hw_version = dd->ipath_revision;

	if (copy_to_user(ubase, kinfo, sizeof(*kinfo)))
		ret = -EFAULT;

bail:
	kfree(kinfo);
	return ret;
}

/**
 * ipath_tid_update - update a port TID
 * @pd: the port
 * @ti: the TID information
 *
 * The new implementation as of Oct 2004 is that the driver assigns
 * the tid and returns it to the caller.  To make it easier to
 * catch bugs, and to reduce search time, we keep a cursor for
 * each port, walking the shadow tid array to find one that's not
 * in use.
 *
 * For now, if we can't allocate the full list, we fail, although
 * in the long run, we'll allocate as many as we can, and the
 * caller will deal with that by trying the remaining pages later.
 * That means that when we fail, we have to mark the tids as not in
 * use again, in our shadow copy.
 *
 * It's up to the caller to free the tids when they are done.
 * We'll unlock the pages as they free them.
 *
 * Also, right now we are locking one page at a time, but since
 * the intended use of this routine is for a single group of
 * virtually contiguous pages, that should change to improve
 * performance.
 */
static int ipath_tid_update(struct ipath_portdata *pd,
			    const struct ipath_tid_info *ti)
{
	int ret = 0, ntids;
	u32 tid, porttid, cnt, i, tidcnt;
	u16 *tidlist;
	struct ipath_devdata *dd = pd->port_dd;
	u64 physaddr;
	unsigned long vaddr;
	u64 __iomem *tidbase;
	unsigned long tidmap[8];
	struct page **pagep = NULL;

	if (!dd->ipath_pageshadow) {
		ret = -ENOMEM;
		goto done;
	}

	cnt = ti->tidcnt;
	if (!cnt) {
		ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
			  (unsigned long long) ti->tidlist);
		/*
		 * Should we treat as success?  likely a bug
		 */
		ret = -EFAULT;
		goto done;
	}
	tidcnt = dd->ipath_rcvtidcnt;
	if (cnt >= tidcnt) {
		/* make sure it all fits in port_tid_pg_list */
		dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
			 "TIDs, only trying max (%u)\n", cnt, tidcnt);
		cnt = tidcnt;
	}
	pagep = (struct page **) pd->port_tid_pg_list;
	tidlist = (u16 *) (&pagep[cnt]);

	memset(tidmap, 0, sizeof(tidmap));
	tid = pd->port_tidcursor;
	/* before decrement; chip actual # */
	porttid = pd->port_port * tidcnt;
	ntids = tidcnt;
	tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
				   dd->ipath_rcvtidbase +
				   porttid * sizeof(*tidbase));

	ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
		   pd->port_port, cnt, tid, tidbase);

	/* virtual address of first page in transfer */
	vaddr = ti->tidvaddr;
	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
		       cnt * PAGE_SIZE)) {
		ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
			  (void *) vaddr, cnt);
		ret = -EFAULT;
		goto done;
	}
	ret = ipath_get_user_pages(vaddr, cnt, pagep);
	if (ret) {
		if (ret == -EBUSY) {
			ipath_dbg("Failed to lock addr %p, %u pages "
				  "(already locked)\n",
				  (void *) vaddr, cnt);
			/*
			 * for now, continue, and see what happens but with
			 * the new implementation, this should never happen,
			 * unless perhaps the user has mpin'ed the pages
			 * themselves (something we need to test)
			 */
			ret = 0;
		} else {
			dev_info(&dd->pcidev->dev,
				 "Failed to lock addr %p, %u pages: "
				 "errno %d\n", (void *) vaddr, cnt, -ret);
			goto done;
		}
	}
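	/*
	 * Pages are now pinned; pick a free TID for each page by
	 * resuming the per-port cursor walk through the shadow array,
	 * wrapping at tidcnt.  ntids bounds the walk so we notice
	 * when every TID for this port is already in use.
	 */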
	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
		for (; ntids--; tid++) {
			if (tid == tidcnt)
				tid = 0;
			if (!dd->ipath_pageshadow[porttid + tid])
				break;
		}
		if (ntids < 0) {
			/*
			 * oops, wrapped all the way through their TIDs,
			 * and didn't have enough free; see comments at
			 * start of routine
			 */
			ipath_dbg("Not enough free TIDs for %u pages "
				  "(index %d), failing\n", cnt, i);
			i--;	/* last tidlist[i] not filled in */
			ret = -ENOMEM;
			break;
		}
		tidlist[i] = tid;
		ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
			   "vaddr %lx\n", i, tid, vaddr);
		/* we "know" system pages and TID pages are same size */
		dd->ipath_pageshadow[porttid + tid] = pagep[i];
		/*
		 * don't need atomic or it's overhead
		 */
		__set_bit(tid, tidmap);
		physaddr = page_to_phys(pagep[i]);
		ipath_stats.sps_pagelocks++;
		ipath_cdbg(VERBOSE,
			   "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
			   tid, vaddr, (unsigned long long) physaddr,
			   pagep[i]);
		dd->ipath_f_put_tid(dd, &tidbase[tid], 1, physaddr);
		/*
		 * don't check this tid in ipath_portshadow, since we
		 * just filled it in; start with the next one.
		 */
		tid++;
	}

	if (ret) {
		u32 limit;
	cleanup:
		/* jump here if copy out of updated info failed... */
		ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
			  -ret, i, cnt);
		/* same code that's in ipath_free_tid() */
		limit = sizeof(tidmap) * BITS_PER_BYTE;
		if (limit > tidcnt)
			/* just in case size changes in future */
			limit = tidcnt;
		tid = find_first_bit((const unsigned long *)tidmap, limit);
		for (; tid < limit; tid++) {
			if (!test_bit(tid, tidmap))
				continue;
			if (dd->ipath_pageshadow[porttid + tid]) {
				ipath_cdbg(VERBOSE, "Freeing TID %u\n",
					   tid);
				dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
						    dd->ipath_tidinvalid);
				dd->ipath_pageshadow[porttid + tid] = NULL;
				ipath_stats.sps_pageunlocks++;
			}
		}
		ipath_release_user_pages(pagep, cnt);
	} else {
		/*
		 * Copy the updated array, with ipath_tid's filled in, back
		 * to user.  Since we did the copy in already, this "should
		 * never fail".  If it does, we have to clean up...
		 */
		if (copy_to_user((void __user *)
				 (unsigned long) ti->tidlist,
				 tidlist, cnt * sizeof(*tidlist))) {
			ret = -EFAULT;
			goto cleanup;
		}
		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
				 tidmap, sizeof tidmap)) {
			ret = -EFAULT;
			goto cleanup;
		}
		if (tid == tidcnt)
			tid = 0;
		pd->port_tidcursor = tid;
	}

done:
	if (ret)
		ipath_dbg("Failed to map %u TID pages, failing with %d\n",
			  ti->tidcnt, -ret);
	return ret;
}
/**
 * ipath_tid_free - free a port TID
 * @pd: the port
 * @ti: the TID info
 *
 * Right now we are unlocking one page at a time, but since
 * the intended use of this routine is for a single group of
 * virtually contiguous pages, that should change to improve
 * performance.  We check that the TID is in range for this port
 * but otherwise don't check validity; if user has an error and
 * frees the wrong tid, it's only their own data that can thereby
 * be corrupted.  We do check that the TID was in use, for sanity.
 * We always use our idea of the saved address, not the address that
 * they pass in to us.
 */
static int ipath_tid_free(struct ipath_portdata *pd,
			  const struct ipath_tid_info *ti)
{
	int ret = 0;
	u32 tid, porttid, cnt, limit, tidcnt;
	struct ipath_devdata *dd = pd->port_dd;
	u64 __iomem *tidbase;
	unsigned long tidmap[8];

	if (!dd->ipath_pageshadow) {
		ret = -ENOMEM;
		goto done;
	}

	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
			   sizeof tidmap)) {
		ret = -EFAULT;
		goto done;
	}

	porttid = pd->port_port * dd->ipath_rcvtidcnt;
	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
				   dd->ipath_rcvtidbase +
				   porttid * sizeof(*tidbase));

	tidcnt = dd->ipath_rcvtidcnt;
	limit = sizeof(tidmap) * BITS_PER_BYTE;
	if (limit > tidcnt)
		/* just in case size changes in future */
		limit = tidcnt;
	tid = find_first_bit(tidmap, limit);
	ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
		   "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
		   limit, tid, porttid);
	for (cnt = 0; tid < limit; tid++) {
		/*
		 * small optimization; if we detect a run of 3 or so without
		 * any set, use find_first_bit again.  That's mainly to
		 * accelerate the case where we wrapped, so we have some at
		 * the beginning, and some at the end, and a big gap
		 * in the middle.
		 */
		if (!test_bit(tid, tidmap))
			continue;
		cnt++;
		if (dd->ipath_pageshadow[porttid + tid]) {
			ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
				   pd->port_pid, tid);
			dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
					    dd->ipath_tidinvalid);
			ipath_release_user_pages(
				&dd->ipath_pageshadow[porttid + tid], 1);
			dd->ipath_pageshadow[porttid + tid] = NULL;
			ipath_stats.sps_pageunlocks++;
		} else
			ipath_dbg("Unused tid %u, ignoring\n", tid);
	}
	if (cnt != ti->tidcnt)
		ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
			  ti->tidcnt, cnt);
done:
	if (ret)
		ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
			  ti->tidcnt, -ret);
	return ret;
}
/**
 * ipath_set_part_key - set a partition key
 * @pd: the port
 * @key: the key
 *
 * We can have up to 4 active at a time (other than the default, which is
 * always allowed).  This is somewhat tricky, since multiple ports may set
 * the same key, so we reference count them, and clean up at exit.  All 4
 * partition keys are packed into a single infinipath register.  It's an
 * error for a process to set the same pkey multiple times.  We provide no
 * mechanism to de-allocate a pkey at this time, we may eventually need to
 * do that.  I've used the atomic operations, and no locking, and only make
 * a single pass through what's available.  This should be more than
 * adequate for some time.  I'll think about spinlocks or the like if and as
 * it's necessary.
 */
static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
{
	struct ipath_devdata *dd = pd->port_dd;
	int i, any = 0, pidx = -1;
	u16 lkey = key & 0x7FFF;
	int ret;

	if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
		/* nothing to do; this key always valid */
		ret = 0;
		goto bail;
	}

	ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
		   "%hx:%x %hx:%x %hx:%x %hx:%x\n",
		   pd->port_port, key, dd->ipath_pkeys[0],
		   atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
		   atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
		   atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
		   atomic_read(&dd->ipath_pkeyrefs[3]));

	if (!lkey) {
		ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
			   pd->port_port);
		ret = -EINVAL;
		goto bail;
	}

	/*
	 * Set the full membership bit, because it has to be
	 * set in the register or the packet, and it seems
	 * cleaner to set in the register than to force all
	 * callers to set it.  (see bug 4331)
	 */
	key |= 0x8000;

	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
		if (!pd->port_pkeys[i] && pidx == -1)
			pidx = i;
		if (pd->port_pkeys[i] == key) {
			ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
				   "(%x) more than once\n",
				   pd->port_port, key);
			ret = -EEXIST;
			goto bail;
		}
	}
	if (pidx == -1) {
		ipath_dbg("All pkeys for port %u already in use, "
			  "can't set %x\n", pd->port_port, key);
		ret = -EBUSY;
		goto bail;
	}
	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
		if (!dd->ipath_pkeys[i]) {
			any++;
			continue;
		}
		if (dd->ipath_pkeys[i] == key) {
			atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];

			if (atomic_inc_return(pkrefs) > 1) {
				pd->port_pkeys[pidx] = key;
				ipath_cdbg(VERBOSE, "p%u set key %x "
					   "matches #%d, count now %d\n",
					   pd->port_port, key, i,
					   atomic_read(pkrefs));
				ret = 0;
				goto bail;
			} else {
				/*
				 * lost race, decrement count, catch below
				 */
				atomic_dec(pkrefs);
				ipath_cdbg(VERBOSE, "Lost race, count was "
					   "0, after dec, it's %d\n",
					   atomic_read(pkrefs));
				any++;
			}
		}
		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
			/*
			 * It makes no sense to have both the limited and
			 * full membership PKEY set at the same time since
			 * the unlimited one will disable the limited one.
			 */
			ret = -EEXIST;
			goto bail;
		}
	}
	if (!any) {
		ipath_dbg("port %u, all pkeys already in use, "
			  "can't set %x\n", pd->port_port, key);
		ret = -EBUSY;
		goto bail;
	}
	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
		if (!dd->ipath_pkeys[i] &&
		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
			u64 pkey;

			/* for ipathstats, etc. */
			ipath_stats.sps_pkeys[i] = lkey;
			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
			pkey =
				(u64) dd->ipath_pkeys[0] |
				((u64) dd->ipath_pkeys[1] << 16) |
				((u64) dd->ipath_pkeys[2] << 32) |
				((u64) dd->ipath_pkeys[3] << 48);
			ipath_cdbg(PROC, "p%u set key %x in #%d, "
				   "portidx %d, new pkey reg %llx\n",
				   pd->port_port, key, i, pidx,
				   (unsigned long long) pkey);
			ipath_write_kreg(
				dd, dd->ipath_kregs->kr_partitionkey, pkey);

			ret = 0;
			goto bail;
		}
	}
	ipath_dbg("port %u, all pkeys already in use 2nd pass, "
		  "can't set %x\n", pd->port_port, key);
	ret = -EBUSY;

bail:
	return ret;
}
/**
 * ipath_manage_rcvq - manage a port's receive queue
 * @pd: the port
 * @start_stop: action to carry out
 *
 * start_stop == 0 disables receive on the port, for use in queue
 * overflow conditions.  start_stop == 1 re-enables, to be used to
 * re-init the software copy of the head register
 */
static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop)
{
	struct ipath_devdata *dd = pd->port_dd;
	u64 tval;

	ipath_cdbg(PROC, "%sabling rcv for unit %u port %u\n",
		   start_stop ? "en" : "dis", dd->ipath_unit,
		   pd->port_port);
	/* atomically clear receive enable port. */
	if (start_stop) {
		/*
		 * On enable, force in-memory copy of the tail register to
		 * 0, so that protocol code doesn't have to worry about
		 * whether or not the chip has yet updated the in-memory
		 * copy or not on return from the system call.  The chip
		 * always resets its tail register back to 0 on a
		 * transition from disabled to enabled.  This could cause a
		 * problem if software was broken, and did the enable w/o
		 * the disable, but eventually the in-memory copy will be
		 * updated and correct itself, even in the face of software
		 * bugs.
		 */
		*pd->port_rcvhdrtail_kvaddr = 0;
		set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
			&dd->ipath_rcvctrl);
	} else
		clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
			  &dd->ipath_rcvctrl);
	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
			 dd->ipath_rcvctrl);
	/* now be sure chip saw it before we return */
	tval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
	if (start_stop) {
		/*
		 * And try to be sure that tail reg update has happened too.
		 * This should in theory interlock with the RXE changes to
		 * the tail register.  Don't assign it to the tail register
		 * in memory copy, since we could overwrite an update by the
		 * chip if we did.
		 */
		tval = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
	}
	/* always; new head should be equal to new tail; see above */
	return 0;
}
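/**
 * ipath_clean_part_key - drop this port's references to partition keys
 * @pd: the port
 * @dd: the infinipath device
 *
 * Called at port close.  For each pkey the port had set, decrement
 * the device-wide reference count; when a count reaches zero, clear
 * that key, and if anything changed, rewrite the packed partition
 * key register in the chip.
 */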
static void ipath_clean_part_key(struct ipath_portdata *pd,
				 struct ipath_devdata *dd)
{
	int i, j, pchanged = 0;
	u64 oldpkey;

	/* for debugging only */
	oldpkey = (u64) dd->ipath_pkeys[0] |
		((u64) dd->ipath_pkeys[1] << 16) |
		((u64) dd->ipath_pkeys[2] << 32) |
		((u64) dd->ipath_pkeys[3] << 48);

	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
		if (!pd->port_pkeys[i])
			continue;
		ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
			   pd->port_pkeys[i]);
		for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
			/* check for match independent of the global bit */
			if ((dd->ipath_pkeys[j] & 0x7fff) !=
			    (pd->port_pkeys[i] & 0x7fff))
				continue;
			if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
				ipath_cdbg(VERBOSE, "p%u clear key "
					   "%x matches #%d\n",
					   pd->port_port,
					   pd->port_pkeys[i], j);
				ipath_stats.sps_pkeys[j] =
					dd->ipath_pkeys[j] = 0;
				pchanged++;
			} else
				ipath_cdbg(VERBOSE, "p%u key %x matches #%d, "
					   "but ref still %d\n",
					   pd->port_port,
					   pd->port_pkeys[i], j,
					   atomic_read(&dd->ipath_pkeyrefs[j]));
			break;
		}
		pd->port_pkeys[i] = 0;
	}
	if (pchanged) {
		u64 pkey = (u64) dd->ipath_pkeys[0] |
			((u64) dd->ipath_pkeys[1] << 16) |
			((u64) dd->ipath_pkeys[2] << 32) |
			((u64) dd->ipath_pkeys[3] << 48);
		ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
			   "new pkey reg %llx\n", pd->port_port,
			   (unsigned long long) oldpkey,
			   (unsigned long long) pkey);
		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
				 pkey);
	}
}

/**
 * ipath_create_user_egr - allocate eager TID buffers
 * @pd: the port to allocate TID buffers for
 *
 * This routine is now quite different for user and kernel, because
 * the kernel uses skb's for the accelerated network performance.
 * This is the user port version.
 *
 * Allocate the eager TID buffers and program them into infinipath.
 * They are no longer completely contiguous; we do multiple
 * allocation calls.
 */
static int ipath_create_user_egr(struct ipath_portdata *pd)
{
	struct ipath_devdata *dd = pd->port_dd;
	unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff;
	size_t size;
	int ret;
	gfp_t gfp_flags;

	/*
	 * GFP_USER, but without GFP_FS, so buffer cache can be
	 * coalesced (we hope); otherwise, even at order 4,
	 * heavy filesystem activity makes these fail, and we can
	 * use compound pages.
	 */
	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;

	egrcnt = dd->ipath_rcvegrcnt;
	/* TID number offset for this port */
	egroff = pd->port_port * egrcnt;
	egrsize = dd->ipath_rcvegrbufsize;
	ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
		   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);

	/*
	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
	 * physically contiguous memory, advance through it until used up
	 * and then allocate more.  Of course, we need memory to store those
	 * extra pointers, now.  Started out with 256KB, but under heavy
	 * memory pressure (creating large files and then copying them over
	 * NFS while doing lots of MPI jobs), we hit some allocation
	 * failures, even though we can sleep...  (2.6.10) Still get
	 * failures at 64K.  32K is the lowest we can go without wasting
	 * additional memory.
	 */
	size = 0x8000;
	alloced = ALIGN(egrsize * egrcnt, size);
	egrperchunk = size / egrsize;
	chunk = (egrcnt + egrperchunk - 1) / egrperchunk;
	pd->port_rcvegrbuf_chunks = chunk;
	pd->port_rcvegrbufs_perchunk = egrperchunk;
	pd->port_rcvegrbuf_size = size;
	pd->port_rcvegrbuf = vmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]));
	if (!pd->port_rcvegrbuf) {
		ret = -ENOMEM;
		goto bail;
	}
	pd->port_rcvegrbuf_phys =
		vmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]));
	if (!pd->port_rcvegrbuf_phys) {
		ret = -ENOMEM;
		goto bail_rcvegrbuf;
	}
	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {

		pd->port_rcvegrbuf[e] = dma_alloc_coherent(
			&dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
			gfp_flags);

		if (!pd->port_rcvegrbuf[e]) {
			ret = -ENOMEM;
			goto bail_rcvegrbuf_phys;
		}
	}

	pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];

	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
		dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
		unsigned i;

		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
			dd->ipath_f_put_tid(dd, e + egroff +
					    (u64 __iomem *)
					    ((char __iomem *)
					     dd->ipath_kregbase +
					     dd->ipath_rcvegrbase), 0, pa);
			pa += egrsize;
		}
		cond_resched();	/* don't hog the cpu */
	}

	ret = 0;
	goto bail;

bail_rcvegrbuf_phys:
	for (e = 0; e < pd->port_rcvegrbuf_chunks &&
		     pd->port_rcvegrbuf[e]; e++) {
		dma_free_coherent(&dd->pcidev->dev, size,
				  pd->port_rcvegrbuf[e],
				  pd->port_rcvegrbuf_phys[e]);

	}
	vfree(pd->port_rcvegrbuf_phys);
	pd->port_rcvegrbuf_phys = NULL;
bail_rcvegrbuf:
	vfree(pd->port_rcvegrbuf);
	pd->port_rcvegrbuf = NULL;
bail:
	return ret;
}
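/**
 * ipath_do_user_init - initialize a port for a user process
 * @pd: the port
 * @uinfo: version and size information from the user
 *
 * Check the user's software version, set up the port's PIO buffer
 * base, receive header queue, and eager TID buffers, then enable
 * receives on the port.  Fails with -ENODEV on a major version
 * mismatch; a minor version mismatch only gets a debug message.
 */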
static int ipath_do_user_init(struct ipath_portdata *pd,
			      const struct ipath_user_info *uinfo)
{
	int ret = 0;
	struct ipath_devdata *dd = pd->port_dd;
	u32 head32;

	/* for now, if major version is different, bail */
	if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
		dev_info(&dd->pcidev->dev,
			 "User major version %d not same as driver "
			 "major %d\n", uinfo->spu_userversion >> 16,
			 IPATH_USER_SWMAJOR);
		ret = -ENODEV;
		goto done;
	}

	if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR)
		ipath_dbg("User minor version %d not same as driver "
			  "minor %d\n", uinfo->spu_userversion & 0xffff,
			  IPATH_USER_SWMINOR);

	if (uinfo->spu_rcvhdrsize) {
		ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
		if (ret)
			goto done;
	}

	/* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */

	/* for right now, kernel piobufs are at end, so port 1 is at 0 */
	pd->port_piobufs = dd->ipath_piobufbase +
		dd->ipath_pbufsport * (pd->port_port - 1) *
		dd->ipath_palign;
	ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n",
		   pd->port_port, pd->port_piobufs);

	/*
	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
	 * array for time being.  If pd->port_port > chip-supported,
	 * we need to do extra stuff here to handle it, by handling
	 * the overflow through port 0, someday.
	 */
	ret = ipath_create_rcvhdrq(dd, pd);
	if (!ret)
		ret = ipath_create_user_egr(pd);
	if (ret)
		goto done;

	/*
	 * set the eager head register for this port to the current values
	 * of the tail pointers, since we don't know if they were
	 * updated on last use of the port.
	 */
	head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
	ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
	dd->ipath_lastegrheads[pd->port_port] = -1;
	dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
		   pd->port_port, head32);
	pd->port_tidcursor = 0;	/* start at beginning after open */
	/*
	 * now enable the port; the tail registers will be written to memory
	 * by the chip as soon as it sees the write to
	 * dd->ipath_kregs->kr_rcvctrl.  The update only happens on
	 * transition from 0 to 1, so clear it first, then set it as part of
	 * enabling the port.  This will (very briefly) affect any other
	 * open ports, but it shouldn't be long enough to be an issue.
	 * We explicitly set the in-memory copy to 0 beforehand, so we don't
	 * have to wait to be sure the DMA update has happened.
	 */
	*pd->port_rcvhdrtail_kvaddr = 0ULL;
	set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
		&dd->ipath_rcvctrl);
	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
			 dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD);
	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
			 dd->ipath_rcvctrl);
done:
	return ret;
}


/* common code for the mappings on dma_alloc_coherent mem */
static int ipath_mmap_mem(struct vm_area_struct *vma,
			  struct ipath_portdata *pd, unsigned len,
			  int write_ok, dma_addr_t addr, char *what)
{
	struct ipath_devdata *dd = pd->port_dd;
	unsigned pfn = (unsigned long)addr >> PAGE_SHIFT;
	int ret;

	if ((vma->vm_end - vma->vm_start) > len) {
		dev_info(&dd->pcidev->dev,
			 "FAIL on %s: len %lx > %x\n", what,
			 vma->vm_end - vma->vm_start, len);
		ret = -EFAULT;
		goto bail;
	}

	if (!write_ok) {
		if (vma->vm_flags & VM_WRITE) {
			dev_info(&dd->pcidev->dev,
				 "%s must be mapped readonly\n", what);
			ret = -EPERM;
			goto bail;
		}

		/* don't allow them to later change with mprotect */
		vma->vm_flags &= ~VM_MAYWRITE;
	}

	ret = remap_pfn_range(vma, vma->vm_start, pfn,
			      len, vma->vm_page_prot);
	if (ret)
		dev_info(&dd->pcidev->dev,
			 "%s port%u mmap of %lx, %x bytes r%c failed: %d\n",
			 what, pd->port_port, (unsigned long)addr, len,
			 write_ok ? 'w' : 'o', ret);
	else
		ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes r%c\n",
			   what, pd->port_port, (unsigned long)addr, len,
			   write_ok ? 'w' : 'o');
bail:
	return ret;
}
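/**
 * mmap_ureg - map the port's user registers in the chip
 * @vma: the VM area
 * @dd: the infinipath device
 * @ureg: chip offset of this port's user registers
 *
 * This maps real (uncached) chip address space; it is how the user
 * process updates the head registers for its port.  Only a single
 * page may be mapped.
 */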
static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
		     u64 ureg)
{
	unsigned long phys;
	int ret;

	/*
	 * This is real hardware, so use io_remap.  This is the mechanism
	 * for the user process to update the head registers for their port
	 * in the chip.
	 */
	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
		dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
			 "%lx > PAGE\n", vma->vm_end - vma->vm_start);
		ret = -EFAULT;
	} else {
		phys = dd->ipath_physaddr + ureg;
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
		ret = io_remap_pfn_range(vma, vma->vm_start,
					 phys >> PAGE_SHIFT,
					 vma->vm_end - vma->vm_start,
					 vma->vm_page_prot);
	}
	return ret;
}
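/**
 * mmap_piobufs - map this port's PIO send buffers
 * @vma: the VM area
 * @dd: the infinipath device
 * @pd: the port
 *
 * Mapped write-only, so a process can neither read back another
 * process's stale data nor accidentally read the i/o space; and
 * mapped write-combining (not uncached) for PIO bandwidth.
 */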
static int mmap_piobufs(struct vm_area_struct *vma,
			struct ipath_devdata *dd,
			struct ipath_portdata *pd)
{
	unsigned long phys;
	int ret;

	/*
	 * When we map the PIO buffers in the chip, we want to map them as
	 * writeonly, no read possible.  This prevents access to previous
	 * process data, and catches users who might try to read the i/o
	 * space due to a bug.
	 */
	if ((vma->vm_end - vma->vm_start) >
	    (dd->ipath_pbufsport * dd->ipath_palign)) {
		dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
			 "reqlen %lx > PAGE\n",
			 vma->vm_end - vma->vm_start);
		ret = -EFAULT;
		goto bail;
	}

	phys = dd->ipath_physaddr + pd->port_piobufs;

	/*
	 * Don't mark this as non-cached, or we don't get the
	 * write combining behavior we want on the PIO buffers!
	 */

#if defined(__powerpc__)
	/* There isn't a generic way to specify writethrough mappings */
	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
#endif

	/*
	 * don't allow them to later change to readable with mprotect (for
	 * when not initially mapped readable, as is normally the case)
	 */
	vma->vm_flags &= ~VM_MAYREAD;
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;

	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
				 vma->vm_end - vma->vm_start,
				 vma->vm_page_prot);
bail:
	return ret;
}
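/**
 * mmap_rcvegrbufs - map the port's eager receive buffers
 * @vma: the VM area
 * @pd: the port
 *
 * The eager buffers are allocated as multiple physically contiguous
 * chunks, so they are remapped chunk by chunk into one virtually
 * contiguous, read-only user range.
 */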
static int mmap_rcvegrbufs(struct vm_area_struct *vma,
			   struct ipath_portdata *pd)
{
	struct ipath_devdata *dd = pd->port_dd;
	unsigned long start, size;
	size_t total_size, i;
	dma_addr_t *phys;
	int ret;

	size = pd->port_rcvegrbuf_size;
	total_size = pd->port_rcvegrbuf_chunks * size;
	if ((vma->vm_end - vma->vm_start) > total_size) {
		dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
			 "reqlen %lx > actual %lx\n",
			 vma->vm_end - vma->vm_start,
			 (unsigned long) total_size);
		ret = -EFAULT;
		goto bail;
	}

	if (vma->vm_flags & VM_WRITE) {
		dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
			 "writable (flags=%lx)\n", vma->vm_flags);
		ret = -EPERM;
		goto bail;
	}
	/* don't allow them to later change to writeable with mprotect */
	vma->vm_flags &= ~VM_MAYWRITE;

	start = vma->vm_start;
	phys = pd->port_rcvegrbuf_phys;

	for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
		ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT,
				      size, vma->vm_page_prot);
		if (ret < 0)
			goto bail;
	}
	ret = 0;

bail:
	return ret;
}

/**
 * ipath_mmap - mmap various structures into user space
 * @fp: the file pointer
 * @vma: the VM area
 *
 * We use this to have a shared buffer between the kernel and the user code
 * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
 * buffers in the chip.  We have the open and close entries so we can bump
 * the ref count and keep the driver from being unloaded while still mapped.
 */
static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
{
	struct ipath_portdata *pd;
	struct ipath_devdata *dd;
	u64 pgaddr, ureg;
	int ret;

	pd = port_fp(fp);
	dd = pd->port_dd;

	/*
	 * This is the ipath_do_user_init() code, mapping the shared buffers
	 * into the user process.  The address referred to by vm_pgoff is the
	 * virtual, not physical, address; we only do one mmap for each
	 * space mapped.
	 */
	pgaddr = vma->vm_pgoff << PAGE_SHIFT;

	/*
	 * Must fit in 40 bits for our hardware; some checked elsewhere,
	 * but we'll be paranoid.  Check for 0 is mostly in case one of the
	 * allocations failed, but user called mmap anyway.  We want to catch
	 * that before it can match.
	 */
	if (!pgaddr || pgaddr >= (1ULL << 40)) {
		ipath_dev_err(dd, "Bad phys addr %llx, start %lx, end %lx\n",
			      (unsigned long long)pgaddr, vma->vm_start,
			      vma->vm_end);
		return -EINVAL;
	}

	/* just the offset of the port user registers, not physical addr */
	ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;

	ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n",
		   (unsigned long long) pgaddr, vma->vm_start,
		   vma->vm_end - vma->vm_start);

	if (vma->vm_start & (PAGE_SIZE - 1)) {
		ipath_dev_err(dd,
			      "vm_start not aligned: %lx, end=%lx phys %lx\n",
			      vma->vm_start, vma->vm_end,
			      (unsigned long)pgaddr);
		ret = -EINVAL;
	} else if (pgaddr == ureg)
		ret = mmap_ureg(vma, dd, ureg);
	else if (pgaddr == pd->port_piobufs)
		ret = mmap_piobufs(vma, dd, pd);
	else if (pgaddr == (u64) pd->port_rcvegr_phys)
		ret = mmap_rcvegrbufs(vma, pd);
	else if (pgaddr == (u64) pd->port_rcvhdrq_phys) {
		/*
		 * The rcvhdrq itself; readonly except on HT (so have
		 * to allow writable mapping), multiple pages, contiguous
		 * from an i/o perspective.
		 */
		unsigned total_size =
			ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize
			      * sizeof(u32), PAGE_SIZE);
		ret = ipath_mmap_mem(vma, pd, total_size, 1,
				     pd->port_rcvhdrq_phys,
				     "rcvhdrq");
	} else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
		/* in-memory copy of rcvhdrq tail register */
		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
				     pd->port_rcvhdrqtailaddr_phys,
				     "rcvhdrq tail");
	else if (pgaddr == dd->ipath_pioavailregs_phys)
		/* in-memory copy of pioavail registers */
		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
				     dd->ipath_pioavailregs_phys,
				     "pioavail registers");
	else
		ret = -EINVAL;

	vma->vm_private_data = NULL;

	if (ret < 0)
		dev_info(&dd->pcidev->dev,
			 "Failure %d on addr %lx, off %lx\n",
			 -ret, vma->vm_start, vma->vm_pgoff);

	return ret;
}
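/**
 * ipath_poll - poll for received packets on a port
 * @fp: the file pointer
 * @pt: the poll table
 *
 * Enable the receive interrupt for the port, then compare the chip's
 * head and tail registers.  If they already differ, packets are
 * waiting and we report POLLIN without sleeping; otherwise re-arm
 * the receive interrupt and sleep on the port wait queue until
 * woken by an arriving packet or timed out.
 */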
static unsigned int ipath_poll(struct file *fp,
			       struct poll_table_struct *pt)
{
	struct ipath_portdata *pd;
	u32 head, tail;
	int bit;
	unsigned pollflag = 0;
	struct ipath_devdata *dd;

	pd = port_fp(fp);
	dd = pd->port_dd;

	bit = pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT;
	set_bit(bit, &dd->ipath_rcvctrl);

	/*
	 * Before blocking, make sure that head is still == tail,
	 * reading from the chip, so we can be sure the interrupt
	 * enable has made it to the chip.  If not equal, disable
	 * interrupt again and return immediately.  This avoids races,
	 * and the overhead of the chip read doesn't matter much at
	 * this point, since we are waiting for something anyway.
	 */

	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
			 dd->ipath_rcvctrl);

	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
	tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);

	if (tail == head) {
		set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
		if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
			(void)ipath_write_ureg(dd, ur_rcvhdrhead,
					       dd->ipath_rhdrhead_intr_off
					       | head, pd->port_port);
		poll_wait(fp, &pd->port_wait, pt);

		if (test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) {
			/* timed out, no packets received */
			clear_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
			pd->port_rcvwait_to++;
		} else
			pollflag = POLLIN | POLLRDNORM;
	} else {
		/* it's already happened; don't do wait_event overhead */
		pollflag = POLLIN | POLLRDNORM;
		pd->port_rcvnowait++;
	}

	clear_bit(bit, &dd->ipath_rcvctrl);
	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
			 dd->ipath_rcvctrl);

	return pollflag;
}
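/**
 * try_alloc_port - try to claim one port on a unit for this open
 * @dd: the infinipath device
 * @port: the port number to try
 * @fp: the file pointer
 *
 * On first use of a port, allocate its portdata and the TID page
 * list used by ipath_tid_update(), then claim it for the current
 * process.  Returns -EBUSY if the port is already in use.
 */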
static int try_alloc_port(struct ipath_devdata *dd, int port,
			  struct file *fp)
{
	int ret;

	if (!dd->ipath_pd[port]) {
		void *p, *ptmp;

		p = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);

		/*
		 * Allocate memory for use in ipath_tid_update() just once
		 * at open, not per call.  Reduces cost of expected send
		 * setup.
		 */
		ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
			       dd->ipath_rcvtidcnt * sizeof(struct page **),
			       GFP_KERNEL);
		if (!p || !ptmp) {
			ipath_dev_err(dd, "Unable to allocate portdata "
				      "memory, failing open\n");
			ret = -ENOMEM;
			kfree(p);
			kfree(ptmp);
			goto bail;
		}
		dd->ipath_pd[port] = p;
		dd->ipath_pd[port]->port_port = port;
		dd->ipath_pd[port]->port_dd = dd;
		dd->ipath_pd[port]->port_tid_pg_list = ptmp;
		init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
	}
	if (!dd->ipath_pd[port]->port_cnt) {
		dd->ipath_pd[port]->port_cnt = 1;
		fp->private_data = (void *) dd->ipath_pd[port];
		ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
			   current->comm, current->pid, dd->ipath_unit,
			   port);
		dd->ipath_pd[port]->port_pid = current->pid;
		strncpy(dd->ipath_pd[port]->port_comm, current->comm,
			sizeof(dd->ipath_pd[port]->port_comm));
		ipath_stats.sps_ports++;
		ret = 0;
		goto bail;
	}
	ret = -EBUSY;

bail:
	return ret;
}

static inline int usable(struct ipath_devdata *dd)
{
	return dd &&
		(dd->ipath_flags & IPATH_PRESENT) &&
		dd->ipath_kregbase &&
		dd->ipath_lid &&
		!(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
				     | IPATH_LINKUNK));
}

static int find_free_port(int unit, struct file *fp)
{
	struct ipath_devdata *dd = ipath_lookup(unit);
	int ret, i;

	if (!dd) {
		ret = -ENODEV;
		goto bail;
	}

	if (!usable(dd)) {
		ret = -ENETDOWN;
		goto bail;
	}

	for (i = 0; i < dd->ipath_cfgports; i++) {
		ret = try_alloc_port(dd, i, fp);
		if (ret != -EBUSY)
			goto bail;
	}
	ret = -EBUSY;

bail:
	return ret;
}
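/**
 * find_best_unit - pick a unit, guided by any cpu affinity
 * @fp: the file pointer
 *
 * If the process has set cpu affinity, prefer the unit "closest" to
 * that cpu (a crude ncpus/nunits mapping for now, as the comment in
 * the body explains); otherwise round-robin across all usable units
 * and ports.
 */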
static int find_best_unit(struct file *fp)
{
	int ret = 0, i, prefunit = -1, devmax;
	int maxofallports, npresent, nup;
	int ndev;

	(void) ipath_count_units(&npresent, &nup, &maxofallports);

	/*
	 * This code is present to allow a knowledgeable person to
	 * specify the layout of processes to processors before opening
	 * this driver, and then we'll assign the process to the "closest"
	 * InfiniPath chip to that processor (we assume reasonable
	 * connectivity, for now).  This code assumes that if affinity has
	 * been set before this point, at most one cpu is set; for now this
	 * is reasonable.  I check for both cpus_empty() and cpus_full(),
	 * in case some kernel variant sets none of the bits when no
	 * affinity is set.  2.6.11 and 12 kernels have all present
	 * cpus set.  Some day we'll have to fix it up further to handle
	 * a cpu subset.  This algorithm fails for two HT chips connected
	 * in tunnel fashion.  Eventually this needs real topology
	 * information.  There may be some issues with dual core numbering
	 * as well.  This needs more work prior to release.
	 */
	if (!cpus_empty(current->cpus_allowed) &&
	    !cpus_full(current->cpus_allowed)) {
		int ncpus = num_online_cpus(), curcpu = -1;
		for (i = 0; i < ncpus; i++)
			if (cpu_isset(i, current->cpus_allowed)) {
				ipath_cdbg(PROC, "%s[%u] affinity set for "
					   "cpu %d\n", current->comm,
					   current->pid, i);
				curcpu = i;
			}
		if (curcpu != -1) {
			if (npresent) {
				prefunit = curcpu / (ncpus / npresent);
				ipath_dbg("%s[%u] %d chips, %d cpus, "
					  "%d cpus/chip, select unit %d\n",
					  current->comm, current->pid,
					  npresent, ncpus, ncpus / npresent,
					  prefunit);
			}
		}
	}

	/*
	 * user ports start at 1, kernel port is 0
	 * For now, we do round-robin access across all chips
	 */

	if (prefunit != -1)
		devmax = prefunit + 1;
	else
		devmax = ipath_count_units(NULL, NULL, NULL);
recheck:
	for (i = 1; i < maxofallports; i++) {
		for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
		     ndev++) {
			struct ipath_devdata *dd = ipath_lookup(ndev);

			if (!usable(dd))
				continue; /* can't use this unit */
			if (i >= dd->ipath_cfgports)
				/*
				 * Maxed out on users of this unit.  Try
				 * next.
				 */
				continue;
			ret = try_alloc_port(dd, i, fp);
			if (!ret)
				goto done;
		}
	}

	if (npresent) {
		if (nup == 0) {
			ret = -ENETDOWN;
			ipath_dbg("No ports available (none initialized "
				  "and ready)\n");
		} else {
			if (prefunit > 0) {
				/* if started above 0, retry from 0 */
				ipath_cdbg(PROC,
					   "%s[%u] no ports on prefunit "
					   "%d, clear and re-check\n",
					   current->comm, current->pid,
					   prefunit);
				devmax = ipath_count_units(NULL, NULL,
							   NULL);
				prefunit = -1;
				goto recheck;
			}
			ret = -EBUSY;
			ipath_dbg("No ports available\n");
		}
	} else {
		ret = -ENXIO;
		ipath_dbg("No boards found\n");
	}

done:
	return ret;
}
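/**
 * ipath_open - open one of the ipath device files
 * @in: the inode
 * @fp: the file pointer
 *
 * Minor 0 is the wildcard device: pick the best unit for the
 * caller.  Higher minors name a specific unit, on which we look
 * for a free port.
 */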
static int ipath_open(struct inode *in, struct file *fp)
{
	int ret, user_minor;

	mutex_lock(&ipath_mutex);

	user_minor = iminor(in) - IPATH_USER_MINOR_BASE;
	ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
		   (long)in->i_rdev, user_minor);

	if (user_minor)
		ret = find_free_port(user_minor - 1, fp);
	else
		ret = find_best_unit(fp);

	mutex_unlock(&ipath_mutex);
	return ret;
}

/**
 * unlock_expected_tids - unlock any expected TID entries port still had in use
 * @pd: port
 *
 * We don't actually update the chip here, because we do a bulk update
 * below, using ipath_f_clear_tids.
 */
static void unlock_expected_tids(struct ipath_portdata *pd)
{
	struct ipath_devdata *dd = pd->port_dd;
	int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
	int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;

	ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
		   pd->port_port);
	for (i = port_tidbase; i < maxtid; i++) {
		if (!dd->ipath_pageshadow[i])
			continue;

		ipath_release_user_pages_on_close(&dd->ipath_pageshadow[i],
						  1);
		dd->ipath_pageshadow[i] = NULL;
		cnt++;
		ipath_stats.sps_pageunlocks++;
	}
	if (cnt)
		ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
			   pd->port_port, cnt);

	if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
		ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
			   (unsigned long long) ipath_stats.sps_pagelocks,
			   (unsigned long long)
			   ipath_stats.sps_pageunlocks);
}
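/**
 * ipath_close - the last reference to an opened port is going away
 * @in: the inode
 * @fp: the file pointer
 *
 * Disable receives on the port, point the rcvhdrq at a dummy page,
 * disarm the port's PIO buffers, release the port's pkeys and any
 * locked expected-TID pages, then free the portdata so the port can
 * be reused.
 */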
static int ipath_close(struct inode *in, struct file *fp)
{
	int ret = 0;
	struct ipath_portdata *pd;
	struct ipath_devdata *dd;
	unsigned port;

	ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
		   (long)in->i_rdev, fp->private_data);

	mutex_lock(&ipath_mutex);

	pd = port_fp(fp);
	port = pd->port_port;
	fp->private_data = NULL;
	dd = pd->port_dd;

	if (pd->port_hdrqfull) {
		ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors "
			   "during run\n", pd->port_comm, pd->port_pid,
			   pd->port_hdrqfull);
		pd->port_hdrqfull = 0;
	}

	if (pd->port_rcvwait_to || pd->port_piowait_to
	    || pd->port_rcvnowait || pd->port_pionowait) {
		ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
			   "%u rcv %u, pio already\n",
			   pd->port_port, pd->port_rcvwait_to,
			   pd->port_piowait_to, pd->port_rcvnowait,
			   pd->port_pionowait);
		pd->port_rcvwait_to = pd->port_piowait_to =
			pd->port_rcvnowait = pd->port_pionowait = 0;
	}
	if (pd->port_flag) {
		ipath_dbg("port %u port_flag still set to 0x%lx\n",
			  pd->port_port, pd->port_flag);
		pd->port_flag = 0;
	}

	if (dd->ipath_kregbase) {
		int i;
		/* atomically clear receive enable port. */
		clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port,
			  &dd->ipath_rcvctrl);
		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
				 dd->ipath_rcvctrl);
		/* and read back from chip to be sure that nothing
		 * else is in flight when we do the rest */
		(void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);

		/* clean up the pkeys for this port user */
		ipath_clean_part_key(pd, dd);


		/*
		 * be paranoid, and never write 0's to these, just use an
		 * unused part of the port 0 tail page.  Of course,
		 * rcvhdraddr points to a large chunk of memory, so this
		 * could still trash things, but at least it won't trash
		 * page 0, and by disabling the port, it should stop "soon",
		 * even if a packet or two is still in flight after we
		 * disabled the port.
		 */
		ipath_write_kreg_port(dd,
			dd->ipath_kregs->kr_rcvhdrtailaddr, port,
			dd->ipath_dummy_hdrq_phys);
		ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
			pd->port_port, dd->ipath_dummy_hdrq_phys);

		i = dd->ipath_pbufsport * (port - 1);
		ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport);

		if (dd->ipath_pageshadow)
			unlock_expected_tids(pd);
		ipath_stats.sps_ports--;
		ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
			   pd->port_comm, pd->port_pid,
			   dd->ipath_unit, port);

		dd->ipath_f_clear_tids(dd, pd->port_port);
	}

	pd->port_cnt = 0;
	pd->port_pid = 0;

	dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */
	mutex_unlock(&ipath_mutex);
	ipath_free_pddata(dd, pd); /* after releasing the mutex */

	return ret;
}

static int ipath_port_info(struct ipath_portdata *pd,
			   struct ipath_port_info __user *uinfo)
{
	struct ipath_port_info info;
	int nup;
	int ret;

	(void) ipath_count_units(NULL, &nup, NULL);
	info.num_active = nup;
	info.unit = pd->port_dd->ipath_unit;
	info.port = pd->port_port;

	if (copy_to_user(uinfo, &info, sizeof(info))) {
		ret = -EFAULT;
		goto bail;
	}
	ret = 0;

bail:
	return ret;
}
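/**
 * ipath_write - handle a driver command written to the device file
 * @fp: the file pointer
 * @data: the user's struct ipath_cmd
 * @count: number of bytes the user wrote
 * @off: unused
 *
 * Copy in the command type, then just the union member that type
 * uses, and dispatch to the matching handler.  On success, the
 * return value is the number of bytes consumed.
 */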
static ssize_t ipath_write(struct file *fp, const char __user *data,
			   size_t count, loff_t *off)
{
	const struct ipath_cmd __user *ucmd;
	struct ipath_portdata *pd;
	const void __user *src;
	size_t consumed, copy;
	struct ipath_cmd cmd;
	ssize_t ret = 0;
	void *dest;

	if (count < sizeof(cmd.type)) {
		ret = -EINVAL;
		goto bail;
	}

	ucmd = (const struct ipath_cmd __user *) data;

	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
		ret = -EFAULT;
		goto bail;
	}

	consumed = sizeof(cmd.type);

	switch (cmd.type) {
	case IPATH_CMD_USER_INIT:
		copy = sizeof(cmd.cmd.user_info);
		dest = &cmd.cmd.user_info;
		src = &ucmd->cmd.user_info;
		break;
	case IPATH_CMD_RECV_CTRL:
		copy = sizeof(cmd.cmd.recv_ctrl);
		dest = &cmd.cmd.recv_ctrl;
		src = &ucmd->cmd.recv_ctrl;
		break;
	case IPATH_CMD_PORT_INFO:
		copy = sizeof(cmd.cmd.port_info);
		dest = &cmd.cmd.port_info;
		src = &ucmd->cmd.port_info;
		break;
	case IPATH_CMD_TID_UPDATE:
	case IPATH_CMD_TID_FREE:
		copy = sizeof(cmd.cmd.tid_info);
		dest = &cmd.cmd.tid_info;
		src = &ucmd->cmd.tid_info;
		break;
	case IPATH_CMD_SET_PART_KEY:
		copy = sizeof(cmd.cmd.part_key);
		dest = &cmd.cmd.part_key;
		src = &ucmd->cmd.part_key;
		break;
	default:
		ret = -EINVAL;
		goto bail;
	}

	if ((count - consumed) < copy) {
		ret = -EINVAL;
		goto bail;
	}

	if (copy_from_user(dest, src, copy)) {
		ret = -EFAULT;
		goto bail;
	}

	consumed += copy;
	pd = port_fp(fp);

	switch (cmd.type) {
	case IPATH_CMD_USER_INIT:
		ret = ipath_do_user_init(pd, &cmd.cmd.user_info);
		if (ret < 0)
			goto bail;
		ret = ipath_get_base_info(
			pd, (void __user *) (unsigned long)
			cmd.cmd.user_info.spu_base_info,
			cmd.cmd.user_info.spu_base_info_size);
		break;
	case IPATH_CMD_RECV_CTRL:
		ret = ipath_manage_rcvq(pd, cmd.cmd.recv_ctrl);
		break;
	case IPATH_CMD_PORT_INFO:
		ret = ipath_port_info(pd,
				      (struct ipath_port_info __user *)
				      (unsigned long) cmd.cmd.port_info);
		break;
	case IPATH_CMD_TID_UPDATE:
		ret = ipath_tid_update(pd, &cmd.cmd.tid_info);
		break;
	case IPATH_CMD_TID_FREE:
		ret = ipath_tid_free(pd, &cmd.cmd.tid_info);
		break;
	case IPATH_CMD_SET_PART_KEY:
		ret = ipath_set_part_key(pd, cmd.cmd.part_key);
		break;
	}

	if (ret >= 0)
		ret = consumed;

bail:
	return ret;
}

static struct class *ipath_class;

static int init_cdev(int minor, char *name, struct file_operations *fops,
		     struct cdev **cdevp, struct class_device **class_devp)
{
	const dev_t dev = MKDEV(IPATH_MAJOR, minor);
	struct cdev *cdev = NULL;
	struct class_device *class_dev = NULL;
	int ret;

	cdev = cdev_alloc();
	if (!cdev) {
		printk(KERN_ERR IPATH_DRV_NAME
		       ": Could not allocate cdev for minor %d, %s\n",
		       minor, name);
		ret = -ENOMEM;
		goto done;
	}

	cdev->owner = THIS_MODULE;
	cdev->ops = fops;
	kobject_set_name(&cdev->kobj, name);

	ret = cdev_add(cdev, dev, 1);
	if (ret < 0) {
		printk(KERN_ERR IPATH_DRV_NAME
		       ": Could not add cdev for minor %d, %s (err %d)\n",
		       minor, name, -ret);
		goto err_cdev;
	}

	class_dev = class_device_create(ipath_class, NULL, dev, NULL, name);

	if (IS_ERR(class_dev)) {
		ret = PTR_ERR(class_dev);
		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
		       "class_dev for minor %d, %s (err %d)\n",
		       minor, name, -ret);
		goto err_cdev;
	}

	goto done;

err_cdev:
	cdev_del(cdev);
	cdev = NULL;

done:
	if (ret >= 0) {
		*cdevp = cdev;
		*class_devp = class_dev;
	} else {
		*cdevp = NULL;
		*class_devp = NULL;
	}

	return ret;
}

int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
		    struct cdev **cdevp, struct class_device **class_devp)
{
	return init_cdev(minor, name, fops, cdevp, class_devp);
}

static void cleanup_cdev(struct cdev **cdevp,
			 struct class_device **class_devp)
{
	struct class_device *class_dev = *class_devp;

	if (class_dev) {
		class_device_unregister(class_dev);
		*class_devp = NULL;
	}

	if (*cdevp) {
		cdev_del(*cdevp);
		*cdevp = NULL;
	}
}

void ipath_cdev_cleanup(struct cdev **cdevp,
			struct class_device **class_devp)
{
	cleanup_cdev(cdevp, class_devp);
}

static struct cdev *wildcard_cdev;
static struct class_device *wildcard_class_dev;

static const dev_t dev = MKDEV(IPATH_MAJOR, 0);

static int user_init(void)
{
	int ret;

	ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
	if (ret < 0) {
		printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
		       "chrdev region (err %d)\n", -ret);
		goto done;
	}

	ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);

	if (IS_ERR(ipath_class)) {
		ret = PTR_ERR(ipath_class);
		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
		       "device class (err %d)\n", -ret);
		goto bail;
	}

	goto done;
bail:
	unregister_chrdev_region(dev, IPATH_NMINORS);
done:
	return ret;
}

static void user_cleanup(void)
{
	if (ipath_class) {
		class_destroy(ipath_class);
		ipath_class = NULL;
	}

	unregister_chrdev_region(dev, IPATH_NMINORS);
}

static atomic_t user_count = ATOMIC_INIT(0);
static atomic_t user_setup = ATOMIC_INIT(0);

int ipath_user_add(struct ipath_devdata *dd)
{
	char name[10];
	int ret;

	if (atomic_inc_return(&user_count) == 1) {
		ret = user_init();
		if (ret < 0) {
			ipath_dev_err(dd, "Unable to set up user support: "
				      "error %d\n", -ret);
			goto bail;
		}
		ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
				&wildcard_class_dev);
		if (ret < 0) {
			ipath_dev_err(dd, "Could not create wildcard "
				      "minor: error %d\n", -ret);
			goto bail_user;
		}

		atomic_set(&user_setup, 1);
	}

	snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);

	ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
			&dd->user_cdev, &dd->user_class_dev);
	if (ret < 0)
		ipath_dev_err(dd, "Could not create user minor %d, %s\n",
			      dd->ipath_unit + 1, name);

	goto bail;

bail_user:
	user_cleanup();
bail:
	return ret;
}

void ipath_user_remove(struct ipath_devdata *dd)
{
	cleanup_cdev(&dd->user_cdev, &dd->user_class_dev);

	if (atomic_dec_return(&user_count) == 0) {
		if (atomic_read(&user_setup) == 0)
			goto bail;

		cleanup_cdev(&wildcard_cdev, &wildcard_class_dev);
		user_cleanup();

		atomic_set(&user_setup, 0);
	}
bail:
	return;
}