Kernel Study

Learn about kernel

/dev/kvm

leave a comment »

I’ll show the very basic path into KVM from user space. If you have loaded the ‘kvm’ module, you can find kvm as a device node, usually in the /dev/ directory. This character device node provides the way to control virtualization features such as checking capabilities, creating a VM, and destroying a VM.

While the kvm module is loading, it registers a character device as a misc device.

        r = misc_register(&kvm_dev);

/*
 * File operations for the kvm character device: only the ioctl entry
 * points are set, and both the native and the compat path are routed
 * to kvm_dev_ioctl().
 */
static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl   = kvm_dev_ioctl,
};

/*
 * The misc device passed to misc_register().  The initializers are
 * positional: presumably minor number, node name and file operations,
 * in struct miscdevice field order -- TODO confirm against
 * <linux/miscdevice.h>.
 */
static struct miscdevice kvm_dev = {
        KVM_MINOR,
        "kvm",
        &kvm_chardev_ops,
};

As you can see here, only the ioctl calls are implemented. This means you can’t do anything useful with ordinary read/write calls or with redirection on the command line; you always have to write an application to use kvm’s features.

kvm_dev_ioctl function is implemented as following:


/*
 * ioctl handler for the kvm device node.
 *
 * Dispatches on the ioctl command number.  KVM_GET_API_VERSION,
 * KVM_CREATE_VM and KVM_GET_VCPU_MMAP_SIZE take no argument and fail
 * with -EINVAL when 'arg' is non-zero.  Any command not listed here is
 * forwarded to kvm_arch_dev_ioctl().
 *
 * Returns the command's result, or a negative errno value.
 */
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
        long mmap_size;

        switch (ioctl) {
        case KVM_GET_API_VERSION:
                return arg ? -EINVAL : KVM_API_VERSION;

        case KVM_CREATE_VM:
                if (arg)
                        return -EINVAL;
                return kvm_dev_ioctl_create_vm();

        case KVM_CHECK_EXTENSION:
                return kvm_dev_ioctl_check_extension_generic(arg);

        case KVM_GET_VCPU_MMAP_SIZE:
                if (arg)
                        return -EINVAL;
                /* One page per region that is compiled in. */
                mmap_size = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
                mmap_size += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
                mmap_size += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
                return mmap_size;

        case KVM_TRACE_ENABLE:
        case KVM_TRACE_PAUSE:
        case KVM_TRACE_DISABLE:
                return kvm_trace_ioctl(ioctl, arg);

        default:
                /* Not a generic command: let the arch code decide. */
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
}

There are 7 predefined commands. If a command is not recognized by this function, it is passed to the kvm_arch_dev_ioctl() function, which does additional checks with the actual hardware module such as kvm_intel or kvm_amd.

KVM_GET_API_VERSION
It returns the version of KVM. In kernel 2.6.30, it is 12 (defined in kvm.h as a KVM_API_VERSION).

KVM_CREATE_VM

KVM_CHECK_EXTENSION

KVM_GET_VCPU_MMAP_SIZE

KVM_TRACE_ENABLE
KVM_TRACE_PAUSE
KVM_TRACE_DISABLE

Written by Sungju

June 11, 2009 at 6:46 am

Posted in Virtualization

Tagged with

How does the kernel determine the low and high memory zones?

leave a comment »

I thought Linux fixed highmem zone and low mem zone at the compile time. But, I was wrong. I even found the command line parameter named ‘highmem=’ which can set the highmem size.

Actually, this calculation is done during initialization, and the following is the code which actually does it.

 895/*
 896 * Determine low and high memory ranges:
 897 */
 898unsigned long __init find_max_low_pfn(void)
 899{
 900        unsigned long max_low_pfn;
 901
 902        max_low_pfn = max_pfn;
 903        if (max_low_pfn > MAXMEM_PFN) {
 904                if (highmem_pages == -1)
 905                        highmem_pages = max_pfn - MAXMEM_PFN;
 906                if (highmem_pages + MAXMEM_PFN < max_pfn)
 907                        max_pfn = MAXMEM_PFN + highmem_pages;
 908                if (highmem_pages + MAXMEM_PFN > max_pfn) {
 909                        printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
 910                        highmem_pages = 0;
 911                }
 912                max_low_pfn = MAXMEM_PFN;
 913#ifndef CONFIG_HIGHMEM
 914                /* Maximum memory usable is what is directly addressable */
 915                printk(KERN_WARNING "Warning only %ldMB will be used.\n",
 916                                        MAXMEM>>20);
 917                if (max_pfn > MAX_NONPAE_PFN)
 918                        printk(KERN_WARNING "Use a PAE enabled kernel.\n");
 919                else
 920                        printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
 921                max_pfn = MAXMEM_PFN;
 922#else /* !CONFIG_HIGHMEM */
 923#ifndef CONFIG_X86_PAE
 924                if (max_pfn > MAX_NONPAE_PFN) {
 925                        max_pfn = MAX_NONPAE_PFN;
 926                        printk(KERN_WARNING "Warning only 4GB will be used.\n");
 927                        printk(KERN_WARNING "Use a PAE enabled kernel.\n");
 928                }
 929#endif /* !CONFIG_X86_PAE */
 930#endif /* !CONFIG_HIGHMEM */
 931        } else {
 932                if (highmem_pages == -1)
 933                        highmem_pages = 0;
 934#ifdef CONFIG_HIGHMEM
 935                if (highmem_pages >= max_pfn) {
 936                        printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
 937                        highmem_pages = 0;
 938                }
 939                if (highmem_pages) {
 940                        if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
 941                                printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
 942                                highmem_pages = 0;
 943                        }
 944                        max_low_pfn -= highmem_pages;
 945                }
 946#else
 947                if (highmem_pages)
 948                        printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
 949#endif
 950        }
 951        return max_low_pfn;
 952}

Written by Sungju

March 24, 2009 at 5:10 am

Posted in Memory

How does Linux get the maximum pfn number?

leave a comment »

I wondered where Linux gets its information about the memory capacity of the machine. On an i386 machine, we can get this information from the BIOS, or e820, which is shorthand for the name of the BIOS function. You can find some information on Wikipedia: http://en.wikipedia.org/wiki/E820

Anyway, after checking basic memory map information, kernel call following function to get max_pfn value.

 868/*
 869 * Find the highest page frame number we have available
 870 */
 871void __init find_max_pfn(void)
 872{
 873        int i;
 874
 875        max_pfn = 0;
 876        if (efi_enabled) {
 877                efi_memmap_walk(efi_find_max_pfn, &max_pfn);
 878                return;
 879        }
 880
 881        for (i = 0; i < e820.nr_map; i++) {
 882                unsigned long start, end;
 883                /* RAM? */
 884                if (e820.map[i].type != E820_RAM)
 885                        continue;
 886                start = PFN_UP(e820.map[i].addr);
 887                end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
 888                if (start >= end)
 889                        continue;
 890                if (end > max_pfn)
 891                        max_pfn = end;
 892        }
 893}

Written by Sungju

March 24, 2009 at 4:52 am

Posted in Memory

Where can I find kernel cmdline parameter parsing code?

leave a comment »

I was wondering what highmem parameter affect to the system, so I tried to dig into the kernel code. But, I had to spend so many hours to find the starting point.

This is the code, I finally found and I want to keep it here for later use.

http://lxr.linux.no/linux+v2.6.11/arch/i386/kernel/setup.c

 668static void __init parse_cmdline_early (char ** cmdline_p)
 669{
 670        char c = ' ', *to = command_line, *from = saved_command_line;
 671        int len = 0;
 672        int userdef = 0;
 673
 674        /* Save unparsed command line copy for /proc/cmdline */
 675        saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
 676
 677        for (;;) {
 678                if (c != ' ')
 679                        goto next_char;
 680                /*
 681                 * "mem=nopentium" disables the 4MB page tables.
 682                 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
 683                 * to <mem>, overriding the bios size.
 684                 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
 685                 * <start> to <start>+<mem>, overriding the bios size.
 686                 *
 687                 * HPA tells me bootloaders need to parse mem=, so no new
 688                 * option should be mem=  [also see Documentation/i386/boot.txt]
 689                 */
 690                if (!memcmp(from, "mem=", 4)) {
 691                        if (to != command_line)
 692                                to--;
 693                        if (!memcmp(from+4, "nopentium", 9)) {
 694                                from += 9+4;
 695                                clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
 696                                disable_pse = 1;
 697                        } else {
 698                                /* If the user specifies memory size, we
 699                                 * limit the BIOS-provided memory map to
 700                                 * that size. exactmap can be used to specify
 701                                 * the exact map. mem=number can be used to
 702                                 * trim the existing memory map.
 703                                 */
 704                                unsigned long long mem_size;
 705
 706                                mem_size = memparse(from+4, &from);
 707                                limit_regions(mem_size);
 708                                userdef=1;
 709                        }
 710                }
 711
 712                else if (!memcmp(from, "memmap=", 7)) {
 713                        if (to != command_line)
 714                                to--;
 715                        if (!memcmp(from+7, "exactmap", 8)) {
 716                                from += 8+7;
 717                                e820.nr_map = 0;
 718                                userdef = 1;
 719                        } else {
 720                                /* If the user specifies memory size, we
 721                                 * limit the BIOS-provided memory map to
 722                                 * that size. exactmap can be used to specify
 723                                 * the exact map. mem=number can be used to
 724                                 * trim the existing memory map.
 725                                 */
 726                                unsigned long long start_at, mem_size;
 727
 728                                mem_size = memparse(from+7, &from);
 729                                if (*from == '@') {
 730                                        start_at = memparse(from+1, &from);
 731                                        add_memory_region(start_at, mem_size, E820_RAM);
 732                                } else if (*from == '#') {
 733                                        start_at = memparse(from+1, &from);
 734                                        add_memory_region(start_at, mem_size, E820_ACPI);
 735                                } else if (*from == '$') {
 736                                        start_at = memparse(from+1, &from);
 737                                        add_memory_region(start_at, mem_size, E820_RESERVED);
 738                                } else {
 739                                        limit_regions(mem_size);
 740                                        userdef=1;
 741                                }
 742                        }
 743                }
 744
 745                else if (!memcmp(from, "noexec=", 7))
 746                        noexec_setup(from + 7);
 747
 748
 749#ifdef  CONFIG_X86_SMP
 750                /*
 751                 * If the BIOS enumerates physical processors before logical,
 752                 * maxcpus=N at enumeration-time can be used to disable HT.
 753                 */
 754                else if (!memcmp(from, "maxcpus=", 8)) {
 755                        extern unsigned int maxcpus;
 756
 757                        maxcpus = simple_strtoul(from + 8, NULL, 0);
 758                }
 759#endif
 760
 761#ifdef CONFIG_ACPI_BOOT
 762                /* "acpi=off" disables both ACPI table parsing and interpreter */
 763                else if (!memcmp(from, "acpi=off", 8)) {
 764                        disable_acpi();
 765                }
 766
 767                /* acpi=force to over-ride black-list */
 768                else if (!memcmp(from, "acpi=force", 10)) {
 769                        acpi_force = 1;
 770                        acpi_ht = 1;
 771                        acpi_disabled = 0;
 772                }
 773
 774                /* acpi=strict disables out-of-spec workarounds */
 775                else if (!memcmp(from, "acpi=strict", 11)) {
 776                        acpi_strict = 1;
 777                }
 778
 779                /* Limit ACPI just to boot-time to enable HT */
 780                else if (!memcmp(from, "acpi=ht", 7)) {
 781                        if (!acpi_force)
 782                                disable_acpi();
 783                        acpi_ht = 1;
 784                }
 785
 786                /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
 787                else if (!memcmp(from, "pci=noacpi", 10)) {
 788                        acpi_disable_pci();
 789                }
 790                /* "acpi=noirq" disables ACPI interrupt routing */
 791                else if (!memcmp(from, "acpi=noirq", 10)) {
 792                        acpi_noirq_set();
 793                }
 794
 795                else if (!memcmp(from, "acpi_sci=edge", 13))
 796                        acpi_sci_flags.trigger =  1;
 797
 798                else if (!memcmp(from, "acpi_sci=level", 14))
 799                        acpi_sci_flags.trigger = 3;
 800
 801                else if (!memcmp(from, "acpi_sci=high", 13))
 802                        acpi_sci_flags.polarity = 1;
 803
 804                else if (!memcmp(from, "acpi_sci=low", 12))
 805                        acpi_sci_flags.polarity = 3;
 806
 807#ifdef CONFIG_X86_IO_APIC
 808                else if (!memcmp(from, "acpi_skip_timer_override", 24))
 809                        acpi_skip_timer_override = 1;
 810#endif
 811
 812#ifdef CONFIG_X86_LOCAL_APIC
 813                /* disable IO-APIC */
 814                else if (!memcmp(from, "noapic", 6))
 815                        disable_ioapic_setup();
 816#endif /* CONFIG_X86_LOCAL_APIC */
 817#endif /* CONFIG_ACPI_BOOT */
 818
 819                /*
 820                 * highmem=size forces highmem to be exactly 'size' bytes.
 821                 * This works even on boxes that have no highmem otherwise.
 822                 * This also works to reduce highmem size on bigger boxes.
 823                 */
 824                else if (!memcmp(from, "highmem=", 8))
 825                        highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
 826
 827                /*
 828                 * vmalloc=size forces the vmalloc area to be exactly 'size'
 829                 * bytes. This can be used to increase (or decrease) the
 830                 * vmalloc area - the default is 128m.
 831                 */
 832                else if (!memcmp(from, "vmalloc=", 8))
 833                        __VMALLOC_RESERVE = memparse(from+8, &from);
 834
 835        next_char:
 836                c = *(from++);
 837                if (!c)
 838                        break;
 839                if (COMMAND_LINE_SIZE <= ++len)
 840                        break;
 841                *(to++) = c;
 842        }
 843        *to = '\0';
 844        *cmdline_p = command_line;
 845        if (userdef) {
 846                printk(KERN_INFO "user-defined physical RAM map:\n");
 847                print_memory_map("user");
 848        }
 849}

Written by Sungju

March 24, 2009 at 2:04 am

Posted in OS Basic

Tagged with

Why free() does not return allocated physical memory to the OS?

leave a comment »

If your application have to fight with huge amount of memory resources, you will find this unexpected result sometimes.

Even though you have free()ed all of the memory back to the kernel, your process still appears to hold that memory when you check with the vmstat or free command.

It can be act differently among different Linux distributions and different library set. But, normally you can see this.

It happens because of an optimization inside the malloc() library. It usually does not return the memory because the area may well be used again, so the malloc library keeps it for later use. But if you use that memory only once and never use it again, it will waste your memory and sometimes reduce performance.

You can actually release the freed memory by calling the malloc_trim() API.

Written by Sungju

March 2, 2009 at 3:01 am

Posted in Memory

Meaning of the ’sysctl.max_lock_depth’

leave a comment »

This tunable limits the amount of deadlock-checking the kernel will do. The default value is 1024.

You can see those codes in the following snippet.

 146/*
 147 * Max number of times we'll walk the boosting chain:
 148 */
 149int max_lock_depth = 1024;
 150
 151/*
 152 * Adjust the priority chain. Also used for deadlock detection.
 153 * Decreases task's usage by one - may thus free the task.
 154 * Returns 0 or -EDEADLK.
 155 */
 156static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 157                                      int deadlock_detect,
 158                                      struct rt_mutex *orig_lock,
 159                                      struct rt_mutex_waiter *orig_waiter,
 160                                      struct task_struct *top_task)
 161{
 162        struct rt_mutex *lock;
 163        struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
 164        int detect_deadlock, ret = 0, depth = 0;
 165        unsigned long flags;
 166
 167        detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
 168                                                         deadlock_detect);
 169
 170        /*
 171         * The (de)boosting is a step by step approach with a lot of
 172         * pitfalls. We want this to be preemptible and we want hold a
 173         * maximum of two locks per step. So we have to check
 174         * carefully whether things change under us.
 175         */
 176 again:
 177        if (++depth > max_lock_depth) {
 178                static int prev_max;
 179
 180                /*
 181                 * Print this only once. If the admin changes the limit,
 182                 * print a new message when reaching the limit again.
 183                 */
 184                if (prev_max != max_lock_depth) {
 185                        prev_max = max_lock_depth;
 186                        printk(KERN_WARNING "Maximum lock depth %d reached "
 187                               "task: %s (%d)\n", max_lock_depth,
 188                               top_task->comm, task_pid_nr(top_task));
 189                }
 190                put_task_struct(task);
 191
 192                return deadlock_detect ? -EDEADLK : 0;
 193        }
 194 retry:
 195        /*

Written by Sungju

January 14, 2009 at 6:00 am

Posted in Tuning

Are malloc() and free() functions thread-safe?

leave a comment »

Recently, someone asked me why he can’t allocated enough memory with a lot of thread functions.

At first, I just tried to explain that there is no problem in Linux kernel or malloc() functions. But, when double checked his code, I found that he used malloc() without any safeguard. Yeah, no synchronization around malloc(). Sigh.

The malloc() and free() functions use static data structures, which can cause synchronization problems in a multi-threaded application. They are not thread-safe. You would be better off writing your own functions to allocate and deallocate memory and putting some synchronization code into those functions. That will be helpful.

There are some web documents that talk about malloc() and free(). One of them is this: http://www.ibm.com/developerworks/linux/library/l-reent.html

Additional Note: In RHEL products, they use ptmalloc() which is thread-safe version of malloc(). So, it does not make any problem within thread application by itself.

Written by Sungju

January 7, 2009 at 6:47 am

Posted in Memory

Tagged with ,

Meaning of /proc/stat

leave a comment »

What are the meanings of each field in /proc/stat file?

Some documents say that each field is expressed in jiffies, but actually it isn’t. That was true in kernel 2.4, but it no longer is in kernel 2.6. We can easily see the meaning of each field in the kernel source.

http://lxr.linux.no/linux+v2.6.27.10/fs/proc/proc_misc.c

static int show_stat(struct seq_file *p, void *v)
 504{
 505        int i;
 506        unsigned long jif;
 507        cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 508        cputime64_t guest;
 509        u64 sum = 0;
 510        struct timespec boottime;
 511        unsigned int *per_irq_sum;
 512
 513        per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
 514        if (!per_irq_sum)
 515                return -ENOMEM;
 516
 517        user = nice = system = idle = iowait =
 518                irq = softirq = steal = cputime64_zero;
 519        guest = cputime64_zero;
 520        getboottime(&boottime);
 521        jif = boottime.tv_sec;
 522
 523        for_each_possible_cpu(i) {
 524                int j;
 525
 526                user = cputime64_add(user, kstat_cpu(i).cpustat.user);
 527                nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 528                system = cputime64_add(system, kstat_cpu(i).cpustat.system);
 529                idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
 530                iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
 531                irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 532                softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 533                steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 534                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 535                for (j = 0; j < NR_IRQS; j++) {
 536                        unsigned int temp = kstat_cpu(i).irqs[j];
 537                        sum += temp;
 538                        per_irq_sum[j] += temp;
 539                }
 540                sum += arch_irq_stat_cpu(i);
 541        }
 542        sum += arch_irq_stat();
 543
 544        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 545                (unsigned long long)cputime64_to_clock_t(user),
 546                (unsigned long long)cputime64_to_clock_t(nice),
 547                (unsigned long long)cputime64_to_clock_t(system),
 548                (unsigned long long)cputime64_to_clock_t(idle),
 549                (unsigned long long)cputime64_to_clock_t(iowait),
 550                (unsigned long long)cputime64_to_clock_t(irq),
 551                (unsigned long long)cputime64_to_clock_t(softirq),
 552                (unsigned long long)cputime64_to_clock_t(steal),
 553                (unsigned long long)cputime64_to_clock_t(guest));
 554        for_each_online_cpu(i) {
 555
 556                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
 557                user = kstat_cpu(i).cpustat.user;
 558                nice = kstat_cpu(i).cpustat.nice;
 559                system = kstat_cpu(i).cpustat.system;
 560                idle = kstat_cpu(i).cpustat.idle;
 561                iowait = kstat_cpu(i).cpustat.iowait;
 562                irq = kstat_cpu(i).cpustat.irq;
 563                softirq = kstat_cpu(i).cpustat.softirq;
 564                steal = kstat_cpu(i).cpustat.steal;
 565                guest = kstat_cpu(i).cpustat.guest;
 566                seq_printf(p,
 567                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 568                        i,
 569                        (unsigned long long)cputime64_to_clock_t(user),
 570                        (unsigned long long)cputime64_to_clock_t(nice),
 571                        (unsigned long long)cputime64_to_clock_t(system),
 572                        (unsigned long long)cputime64_to_clock_t(idle),
 573                        (unsigned long long)cputime64_to_clock_t(iowait),
 574                        (unsigned long long)cputime64_to_clock_t(irq),
 575                        (unsigned long long)cputime64_to_clock_t(softirq),
 576                        (unsigned long long)cputime64_to_clock_t(steal),
 577                        (unsigned long long)cputime64_to_clock_t(guest));
 578        }
 579        seq_printf(p, "intr %llu", (unsigned long long)sum);
 580
 581        for (i = 0; i < NR_IRQS; i++)
 582                seq_printf(p, " %u", per_irq_sum[i]);
 583
 584        seq_printf(p,
 585                "\nctxt %llu\n"
 586                "btime %lu\n"
 587                "processes %lu\n"
 588                "procs_running %lu\n"
 589                "procs_blocked %lu\n",
 590                nr_context_switches(),
 591                (unsigned long)jif,
 592                total_forks,
 593                nr_running(),
 594                nr_iowait());
 595
 596        kfree(per_irq_sum);
 597        return 0;
 598}

From line 544 to 553, you can see the meaning of each field.

One little strange part is that the value which is calculated during the timer interrupt is converted to clock_t type using cputime64_to_clock_t() macro.

http://lxr.linux.no/linux+v2.6.27.10/include/asm-generic/cputime.h#L67

/*
  65 * Convert cputime64 to clock.
  66 */
  67#define cputime64_to_clock_t(__ct)      jiffies_64_to_clock_t(__ct)
  68

http://lxr.linux.no/linux+v2.6.27.10/kernel/time.c#L618

 618u64 jiffies_64_to_clock_t(u64 x)
 619{
 620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
 621# if HZ < USER_HZ
 622        x = div_u64(x * USER_HZ, HZ);
 623# elif HZ > USER_HZ
 624        x = div_u64(x, HZ / USER_HZ);
 625# else
 626        /* Nothing to do */
 627# endif
 628#else
 629        /*
 630         * There are better ways that don't overflow early,
 631         * but even this doesn't overflow in hundreds of years
 632         * in 64 bits, so..
 633         */
 634        x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
 635#endif
 636        return x;
 637}
 638EXPORT_SYMBOL(jiffies_64_to_clock_t);

In the end, the internal jiffies value is divided by the ratio of HZ to USER_HZ. USER_HZ is 100, so if HZ is 1000, the internal jiffies value will be divided by 10. This means that we see the same result on 2.4 and 2.6, but it does not mean that those values come straight from jiffies.

If we want to see the values in seconds, we can divide the value by CLOCKS_PER_SEC macro which is defined in time.h.

Written by Sungju

December 24, 2008 at 12:17 pm

Posted in Process

Unix Top command

leave a comment »

If you are a Unix person, you know the ‘top’ command well; it displays very useful information about system usage. I also use it a lot when checking the system state. But recently one of my customers asked me about the difference between the ‘CPU states:’ field and each process’s CPU usage. He asked why they do not match.

I tried to figure it out by check the source code, but it’s still not clear. So, I googled about ‘top’ command. Because of too simple word ‘top’, it was not easy to find the correct documentation. I struggled a few days. :P

top-output.gif

But, finally I found the original site, ‘http://www.unixtop.org/‘.
They explained about this with very simple but clear way. If you have any interest about this field, you can check this web site’s ‘Documentation’ section.

Written by Sungju

December 10, 2008 at 2:02 am

Posted in OS Basic

What happened in my stack?

leave a comment »

If you write a program which requires a big chunk of memory at the time, you will notice the influences of ’stack size’.

I will show you what will happen in this stack limitation with the following simple program.

#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main() {
	char *a;
	unsigned long i, half_gb = 512 * 1024 * 1024;
	unsigned long max = (unsigned long)3 * 1024 * 1024 * 1024;

	printf("PID = %d\n\n", getpid());
	for (i = half_gb; i < max; i += half_gb) {
		a = malloc(i);
		printf("a = %p, size = %uMB\n", a, i / (1024 * 1024));
		if (a != NULL) {
			fgets(a, i, stdin);
			free(a);
		}
	}
	return 0;
}

In Linux, you can change the stack size by ‘ulimit -s’ command. As a default, it has 10240 (10MB).

[root@localhost ~]# ulimit -a
core file size          (blocks, -c) 0
data seg size           (kbytes, -d) unlimited
scheduling priority             (-e) 0
file size               (blocks, -f) unlimited
pending signals                 (-i) 32768
max locked memory       (kbytes, -l) 32
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1024
pipe size            (512 bytes, -p) 8
POSIX message queues     (bytes, -q) 819200
real-time priority              (-r) 0
stack size              (kbytes, -s) 10240
cpu time               (seconds, -t) unlimited
max user processes              (-u) 32768
virtual memory          (kbytes, -v) unlimited
file locks                      (-x) unlimited

As you can see, the stack size is ‘10240kbytes’. We can change it with ‘-s’ option. First, without changing anything I will run the above application.

[root@localhost ~]# ./test
PID = 7865

a = 0x97f9e008, size = 512MB

a = 0x77f9e008, size = 1024MB

a = 0x57f9e008, size = 1536MB

a = 0x37f9e008, size = 2048MB
a = 0x17f9e008, size = 2560MB

And the memory layout will be something like this:

[root@localhost ~]# pmap -x 7865
7865:   ./test
Address   Kbytes     RSS    Anon  Locked Mode   Mapping
004c9000     104       -       -       - r-x--  ld-2.5.so
004e3000       4       -       -       - r-x--  ld-2.5.so
004e4000       4       -       -       - rwx--  ld-2.5.so
004e7000    1268       -       -       - r-x--  libc-2.5.so
00624000       8       -       -       - r-x--  libc-2.5.so
00626000       4       -       -       - rwx--  libc-2.5.so
00627000      12       -       -       - rwx--    [ anon ]
00f43000       4       -       -       - r-x--    [ anon ]
08048000       4       -       -       - r-x--  test
08049000       4       -       -       - rw---  test
57f9e000 1572876       -       -       - rw---    [ anon ]
b7fae000       8       -       -       - rw---    [ anon ]
bfaac000      84       -       -       - rw---    [ stack ]
-------- ------- ------- ------- -------
total kB 1574384       -       -       -

You can see that the malloc() memory starts at 57f9e000. It looks normal, and nothing looks special in this layout. But you will soon find something different.

If you change the stack size with something like this:

[root@localhost ~]# ulimit -s unlimited
[root@localhost ~]# ulimit -a
core file size          (blocks, -c) 0
data seg size           (kbytes, -d) unlimited
scheduling priority             (-e) 0
file size               (blocks, -f) unlimited
pending signals                 (-i) 32768
max locked memory       (kbytes, -l) 32
max memory size         (kbytes, -m) unlimited
open files                      (-n) 1024
pipe size            (512 bytes, -p) 8
POSIX message queues     (bytes, -q) 819200
real-time priority              (-r) 0
stack size              (kbytes, -s) unlimited
cpu time               (seconds, -t) unlimited
max user processes              (-u) 32768
virtual memory          (kbytes, -v) unlimited
file locks                      (-x) unlimited

And run the same application again.

[root@localhost ~]# ./test
PID = 7791

a = 0x40012008, size = 512MB

a = 0x40012008, size = 1024MB

a = 0x40012008, size = 1536MB

a = (nil), size = 2048MB
a = (nil), size = 2560MB

It failed when I requested 2GB of memory, which succeeded in the previous test. When I looked at the memory layout, it was different.

[root@localhost ~]# pmap -x 7791
7791:   ./test
Address   Kbytes     RSS    Anon  Locked Mode   Mapping
004c9000     104       -       -       - r-x--  ld-2.5.so
004e3000       4       -       -       - r-x--  ld-2.5.so
004e4000       4       -       -       - rwx--  ld-2.5.so
004e7000    1268       -       -       - r-x--  libc-2.5.so
00624000       8       -       -       - r-x--  libc-2.5.so
00626000       4       -       -       - rwx--  libc-2.5.so
00627000      12       -       -       - rwx--    [ anon ]
08048000       4       -       -       - r-x--  test
08049000       4       -       -       - rwx--  test
40000000       4       -       -       - r-x--    [ anon ]
40001000       8       -       -       - rw---    [ anon ]
40010000 1572876       -       -       - rw---    [ anon ]
bf87a000      88       -       -       - rw---    [ stack ]
-------- ------- ------- ------- -------
total kB 1574388       -       -       -

The main difference is that with the ‘unlimited’ option, we can see more allocated memory chunks around the malloc() area. You can see who uses the other memory chunks, apart from the one allocated by malloc(). As you can see in the following listing, the main reason is ‘[vdso]’. Because it is located at a fixed location, every memory allocation has to start after it. So every possible memory allocation must be placed after that address, which reduces the available memory size.

40000000-40001000 r-xp 40000000 00:00 0          [vdso]
40001000-40003000 rw-p 40001000 00:00 0
40010000-80013000 rw-p 40010000 00:00 0
bffde000-bfff3000 rw-p bffde000 00:00 0          [stack]

The main problem is that ‘[vdso]’ is inserted into the middle of the heap when the stack size is set to ‘unlimited’. [vdso] is the memory area used for the fast system call mechanism. With this memory area, we don’t need to raise an interrupt; we just access that memory range. So it is much faster than an interrupt-based call.

I tried to find out why this location differs depending on the stack size, and I found the answer in the arch_pick_mmap_layout(..) function.

  33/*
  34 * Top of mmap area (just below the process stack).
  35 *
  36 * Leave an at least ~128 MB hole.
  37 */
  38#define MIN_GAP (128*1024*1024)
  39#define MAX_GAP (TASK_SIZE/6*5)
  40
  41/*
  42 * True on X86_32 or when emulating IA32 on X86_64
  43 */
  44static int mmap_is_ia32(void)
  45{
  46#ifdef CONFIG_X86_32
  47        return 1;
  48#endif
  49#ifdef CONFIG_IA32_EMULATION
  50        if (test_thread_flag(TIF_IA32))
  51                return 1;
  52#endif
  53        return 0;
  54}
  55
  56static int mmap_is_legacy(void)
  57{
  58        if (current->personality & ADDR_COMPAT_LAYOUT)
  59                return 1;
  60
  61        if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
  62                return 1;
  63
  64        return sysctl_legacy_va_layout;
  65}
  66
  67static unsigned long mmap_rnd(void)
  68{
  69        unsigned long rnd = 0;
  70
  71        /*
  72        *  8 bits of randomness in 32bit mmaps, 20 address space bits
  73        * 28 bits of randomness in 64bit mmaps, 40 address space bits
  74        */
  75        if (current->flags & PF_RANDOMIZE) {
  76                if (mmap_is_ia32())
  77                        rnd = (long)get_random_int() % (1<<8);
  78                else
  79                        rnd = (long)(get_random_int() % (1<<28));
  80        }
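  81        return rnd << PAGE_SHIFT;
  82}
  83
  84static unsigned long mmap_base(void)
  85{
  86        unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
  87
  88        if (gap < MIN_GAP)
  89                gap = MIN_GAP;
  90        else if (gap > MAX_GAP)
  91                gap = MAX_GAP;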
  92
  93        return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
  94}
  95
  96/*
  97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
  98 * does, but not when emulating X86_32
  99 */
 100static unsigned long mmap_legacy_base(void)
 101{
 102        if (mmap_is_ia32())
 103                return TASK_UNMAPPED_BASE;
 104        else
 105                return TASK_UNMAPPED_BASE + mmap_rnd();
 106}
 107
 108/*
 109 * This function, called very early during the creation of a new
 110 * process VM image, sets up which VM layout function to use:
 111 */
 112void arch_pick_mmap_layout(struct mm_struct *mm)
 113{
 114        if (mmap_is_legacy()) {
 115                mm->mmap_base = mmap_legacy_base();
 116                mm->get_unmapped_area = arch_get_unmapped_area;
 117                mm->unmap_area = arch_unmap_area;
 118        } else {
 119                mm->mmap_base = mmap_base();
 120                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 121                mm->unmap_area = arch_unmap_area_topdown;
 122        }
 123}
 124

mmap_is_legacy() will return TRUE if the stack size is set to unlimited (or for some other reasons which I will not explain here). If mmap_is_legacy() returns TRUE, mm->mmap_base is set to TASK_UNMAPPED_BASE (1GB) on an i386 box. This is the start address of the heap, and the vdso is located right there. So the problem(?) happens.

Written by Sungju

November 25, 2008 at 1:03 pm

Posted in Memory

Tagged with , , ,